In [6]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from datetime import datetime
from matplotlib.dates import date2num
import geopandas as gpd
from shapely.geometry import Point,Polygon
import plotly.graph_objects as go
import plotly.express as px
import plotly_express as px
import squarify

# PCA & MDS #
from sklearn import manifold
from sklearn.metrics import euclidean_distances
from sklearn.decomposition import PCA
from sklearn.manifold import MDS
from scipy.spatial.distance import pdist, cdist
In [40]:
df_gender=pd.read_csv("data on gender on england population.csv")
In [41]:
df_gender = df_gender.iloc[1: , :]
In [42]:
df_gender = df_gender.drop(['CDU_ID','GEO_TYP2'], axis=1)
In [222]:
df_gender
Out[222]:
GEO_CODE GEO_LABEL GEO_TYPE Sex : Total\ Sex - Unit : Persons Sex : Males - Unit : Persons Sex : Females - Unit : Persons
1 E92000001 England Countries and Groupings 53012456.0 26069148.0 26943308.0
2 E12000001 North East Regions 2596886.0 1269703.0 1327183.0
3 E12000002 North West Regions 7052177.0 3464685.0 3587492.0
4 E12000003 Yorkshire and The Humber Regions 5283733.0 2598078.0 2685655.0
5 E12000004 East Midlands Regions 4533222.0 2234493.0 2298729.0
... ... ... ... ... ... ...
219050 E00176770 E00176770 Output Areas and Small Areas 952.0 878.0 74.0
219051 E00176771 E00176771 Output Areas and Small Areas 621.0 188.0 433.0
219052 E00176772 E00176772 Output Areas and Small Areas 256.0 131.0 125.0
219053 E00176773 E00176773 Output Areas and Small Areas 169.0 95.0 74.0
219054 E00176774 E00176774 Output Areas and Small Areas 102.0 53.0 49.0

219054 rows × 6 columns

In [520]:
df_gender['GEO_CODE']=='Shropshire'
Out[520]:
1         False
2         False
3         False
4         False
5         False
          ...  
219050    False
219051    False
219052    False
219053    False
219054    False
Name: GEO_CODE, Length: 219054, dtype: bool
In [1089]:
Key = df_gender[df_gender['GEO_LABEL']=='Bedfordshire']
In [1095]:
Key = df_gender[df_gender['GEO_CODE']=='E06000055']

Berkshire = "West Berkshire" BRISTOL = "Bristol, City of" Cheshire = "Cheshire East" Cornwall = "Cornwall, Isles of Scilly" Derbyshire = "Derbyshire Dales" Devon = "Mid Devon" Durham = "County Durham" Herefordshire = "Herefordshire, County of" Kingston upon Hull = ""Kingston upon Hull, City of" Westminster = "City of London, Westminster" Stockton-on-Trent = "Stoke-on-Trent" Bedfordshire = "Central Bedfordshire" London = "City of London 001"

Bedfordshire = "Central Bedfordshire"

254,381 Central Bedfordshire E06000056

157,479 Bedford E06000055

In [1096]:
Key
Out[1096]:
GEO_CODE GEO_LABEL GEO_TYPE Sex : Total\ Sex - Unit : Persons Sex : Males - Unit : Persons Sex : Females - Unit : Persons
98 E06000055 Bedford Local Authorities 157479.0 77312.0 80167.0
In [614]:
["E09000002","E06000022","E06000055","E06000037","E09000004","E06000008","E06000028","E09000005","E06000043","E06000023","E09000006","E10000002","E10000003","E09000007","E06000049","E41000052","E09000008","E10000006","E06000005","E07000035","E06000015","E07000042","E10000009","E06000047","E09000009","E06000011","E10000011","E09000010","E10000012","E10000013","E09000011","E09000012","E06000006","E09000013","E10000014","E09000014","E09000015","E06000001","E09000016","E06000019","E10000015","E09000017","E09000018","E06000046","E09000019","E09000020","E10000016","E06000010","E09000021","E09000022","E10000017","E06000016","E10000018","E09000023","E10000019","E02000001","E06000032","E08000003","E06000035","E11000002","E09000024","E06000002","E06000042","E09000025","E10000020","E06000012","E06000013","E06000024","E10000023","E10000021","E06000048","E06000018","E10000024","E10000025","E06000031","E06000026","E06000029","E06000044","E09000026","E06000003","E09000027","E06000017","E06000051","E10000027","E06000025","E11000003","E06000045","E06000033","E09000028","E10000028","E06000021","E06000004","E10000029","E10000030","E09000029","E06000030","E06000020","E06000034","E06000027","E09000030","E11000004","E09000031","E09000032","E06000007","E10000031","E11000005","E10000032","E11000006","E41000324","E06000054","E10000034","E06000014"]
Out[614]:
['E09000002',
 'E06000022',
 'E06000055',
 'E06000037',
 'E09000004',
 'E06000008',
 'E06000028',
 'E09000005',
 'E06000043',
 'E06000023',
 'E09000006',
 'E10000002',
 'E10000003',
 'E09000007',
 'E06000049',
 'E41000052',
 'E09000008',
 'E10000006',
 'E06000005',
 'E07000035',
 'E06000015',
 'E07000042',
 'E10000009',
 'E06000047',
 'E09000009',
 'E06000011',
 'E10000011',
 'E09000010',
 'E10000012',
 'E10000013',
 'E09000011',
 'E09000012',
 'E06000006',
 'E09000013',
 'E10000014',
 'E09000014',
 'E09000015',
 'E06000001',
 'E09000016',
 'E06000019',
 'E10000015',
 'E09000017',
 'E09000018',
 'E06000046',
 'E09000019',
 'E09000020',
 'E10000016',
 'E06000010',
 'E09000021',
 'E09000022',
 'E10000017',
 'E06000016',
 'E10000018',
 'E09000023',
 'E10000019',
 'E02000001',
 'E06000032',
 'E08000003',
 'E06000035',
 'E11000002',
 'E09000024',
 'E06000002',
 'E06000042',
 'E09000025',
 'E10000020',
 'E06000012',
 'E06000013',
 'E06000024',
 'E10000023',
 'E10000021',
 'E06000048',
 'E06000018',
 'E10000024',
 'E10000025',
 'E06000031',
 'E06000026',
 'E06000029',
 'E06000044',
 'E09000026',
 'E06000003',
 'E09000027',
 'E06000017',
 'E06000051',
 'E10000027',
 'E06000025',
 'E11000003',
 'E06000045',
 'E06000033',
 'E09000028',
 'E10000028',
 'E06000021',
 'E06000004',
 'E10000029',
 'E10000030',
 'E09000029',
 'E06000030',
 'E06000020',
 'E06000034',
 'E06000027',
 'E09000030',
 'E11000004',
 'E09000031',
 'E09000032',
 'E06000007',
 'E10000031',
 'E11000005',
 'E10000032',
 'E11000006',
 'E41000324',
 'E06000054',
 'E10000034',
 'E06000014']
In [ ]:
"E09000002",

Geocodes

Barking and Dagenham - E09000002

Bath and north east somerset - E06000022

Bedfordshire - E06000056

Berkshire - E06000037

Bexley - E09000004

Blackburn with Darwen - E06000008

Bournemouth - E06000028

Brent - E09000005

Brighton and Hove - E06000043

Bristol - E06000023

Bromley - E09000006

Buckinghamshire - E10000002

Cambridgeshire - E07000009

Camden - E09000007

Cheshire - E06000049

Cornwall - E06000052

Croydon - E09000008

Cumbria - E10000006

Darlington - E06000005

Derbyshire - E07000035

Derby - E06000015

Devon - E07000042

Dorset - E07000049

Durham - E06000047

Eailing- E09000009

East riding of Yorkshire - E06000011

East sussex - E07000228

Enfield - E09000010

Essex - E10000012

Gloucestershire - E06000025

Greenwich - E09000011

Hackney - E09000012

Halton - E06000006

Hammersmith and Fulham - E09000013

Hampshire - E07000085

Haringey - E09000014

Harrow - E09000015

Hartlepool - E06000001

Havering - E09000016

Herefordshire - E06000019

Hertfordshire - E07000099

Hillingdon - E09000017

Hounslow - E09000018

Isle of Wight - E06000046

Islington - E09000019

Kinsington and Chelsea - E09000020

Kent - E10000016

Kingston upon Hull - E06000010

Lambeth - E09000022

Lancashire - E07000127

Leicestershire - E07000134

Leicester - E06000016

Lewisham - E09000023

Lincolnshire - E06000013

City of London - E09000001

Luton - E06000032

Manchester - E08000003

Medway - E06000035

Merseyside - E11000002

Merton - E09000024

Middlesbrough - E06000002

Milton Keynes - E06000042

Newham - E09000025

Norfolk - E10000020

North East Lincolnshire - E06000012

North Somerset - E06000024

North Yorkshire - E10000023

Northamptonshire - E10000021

Northumberland - E06000048

Nottinghamshire - E10000024

Nottingham - E06000018

Oxfordshire - E10000025

Peterborough - E06000031

Plymouth - E06000026

Poole - E06000029

Portsmouth - E06000044

Redbridge - E09000026

Redcar and cleveland - E06000003

Richmond Upon Thames - E09000027

Rutland - E06000017

Shropshire - E06000051

Somerset - E10000027

South Gloucestershire - E06000025

South Yorkshire - E11000003

Southampton - E06000045

Southend-on-sea - E06000033

Southwark - E09000028

Staffordshire - E10000028

Stockton-on-tees - E06000004

Stoke-on-trent - E06000021

Suffolk - E10000029

Surrey - E10000030

Sutton - E09000029

Swindon - E06000030

Telford and wrekin - E06000020

Thurrock - E06000034

Torbay - E06000027

Tower Hamlets - E09000030

Tyne and Wear - E11000004

Waltham Forest - E09000031

Wandsworth - E09000032

Warrington - E06000007

Warwichshire - E10000031

West Midlands - E11000005

West Sussex - E10000032

West Yorkshire - E11000006

Westminster - E09000033

Wiltshire - E06000054

Worcesstershire - E10000034

York - E06000014

Kingston upon Thames - E09000021

Milton Keynes - E06000042

Colate all codes which corresponds to our shape files visualised on QGIS

E09000002,E06000022,E06000056,E06000037,E09000004,E06000008,E06000028,E09000005,E06000043,E06000023,E09000006,E10000002,E07000009,E09000007,E06000049,E06000052,E09000008,E10000006,E06000005,E07000035,E06000015,E07000042,E07000049,E06000047,E09000009,E06000011,E07000228,E09000010,E10000012,E06000025,E09000011,E09000012,E06000006,E09000013,E07000085,E09000014,E09000015,E06000001,E09000016,E06000019,E07000099,E09000017,E09000018,E06000046,E09000019,E09000020,E10000016,E06000010,E09000022,E07000127,E07000134,E06000016,E09000023,E06000013,E09000001,E06000032,E08000003,E06000035,E11000002,E09000024,E06000002,E06000042,E09000025,E10000020,E06000012,E06000024,E10000023,E10000021,E06000048,E10000024,E06000018,E10000025,E06000031,E06000026,E06000029,E06000044,E09000026,E06000003,E09000027,E06000017,E06000051,E10000027,E06000025,E11000003,E06000045,E06000033,E09000028,E10000028,E06000004,E06000021,E10000029,E10000030,E09000029,E06000030,E06000020,E06000034,E06000027,E09000030,E11000004,E09000031,E09000032,E06000007,E10000031,E11000005,E10000032,E11000006,E09000033,E06000054,E06000014,E09000021,E10000034

Filter Dataframe

In [615]:
GeoCodes=["E09000002","E06000022","E06000055","E06000037","E09000004","E06000008","E06000028","E09000005","E06000043","E06000023","E09000006","E10000002","E10000003","E09000007","E06000049","E41000052","E09000008","E10000006","E06000005","E07000035","E06000015","E07000042","E10000009","E06000047","E09000009","E06000011","E10000011","E09000010","E10000012","E10000013","E09000011","E09000012","E06000006","E09000013","E10000014","E09000014","E09000015","E06000001","E09000016","E06000019","E10000015","E09000017","E09000018","E06000046","E09000019","E09000020","E10000016","E06000010","E09000021","E09000022","E10000017","E06000016","E10000018","E09000023","E10000019","E02000001","E06000032","E08000003","E06000035","E11000002","E09000024","E06000002","E06000042","E09000025","E10000020","E06000012","E06000013","E06000024","E10000023","E10000021","E06000048","E06000018","E10000024","E10000025","E06000031","E06000026","E06000029","E06000044","E09000026","E06000003","E09000027","E06000017","E06000051","E10000027","E06000025","E11000003","E06000045","E06000033","E09000028","E10000028","E06000021","E06000004","E10000029","E10000030","E09000029","E06000030","E06000020","E06000034","E06000027","E09000030","E11000004","E09000031","E09000032","E06000007","E10000031","E11000005","E10000032","E11000006","E41000324","E06000054","E10000034","E06000014"]
In [616]:
filtered_Gender = df_gender[df_gender['GEO_CODE'].isin(GeoCodes)]
print(filtered_Gender )
       GEO_CODE                    GEO_LABEL  \
11    E10000002              Buckinghamshire   
12    E10000003               Cambridgeshire   
13    E10000006                      Cumbria   
16    E10000009                       Dorset   
17    E10000011                  East Sussex   
...         ...                          ...   
366   E09000031               Waltham Forest   
367   E09000032                   Wandsworth   
368   E41000052    Cornwall, Isles of Scilly   
369   E41000324  City of London, Westminster   
8048  E02000001           City of London 001   

                                              GEO_TYPE  \
11                                            Counties   
12                                            Counties   
13                                            Counties   
16                                            Counties   
17                                            Counties   
...                                                ...   
366                                  Local Authorities   
367                                  Local Authorities   
368                                  Local Authorities   
369                                  Local Authorities   
8048  Middle Super Output Areas and Intermediate Zones   

      Sex : Total\ Sex - Unit : Persons  Sex : Males - Unit : Persons  \
11                             505283.0                      248346.0   
12                             621210.0                      309560.0   
13                             499858.0                      246065.0   
16                             412905.0                      201271.0   
17                             526671.0                      253764.0   
...                                 ...                           ...   
366                            258249.0                      128970.0   
367                            306995.0                      148646.0   
368                            534476.0                      258907.0   
369                            226771.0                      115639.0   
8048                             7375.0                        4091.0   

      Sex : Females - Unit : Persons  
11                          256937.0  
12                          311650.0  
13                          253793.0  
16                          211634.0  
17                          272907.0  
...                              ...  
366                         129279.0  
367                         158349.0  
368                         275569.0  
369                         111132.0  
8048                          3284.0  

[112 rows x 6 columns]
In [618]:
filtered_Gender.head()
Out[618]:
GEO_CODE GEO_LABEL GEO_TYPE Sex : Total\ Sex - Unit : Persons Sex : Males - Unit : Persons Sex : Females - Unit : Persons
11 E10000002 Buckinghamshire Counties 505283.0 248346.0 256937.0
12 E10000003 Cambridgeshire Counties 621210.0 309560.0 311650.0
13 E10000006 Cumbria Counties 499858.0 246065.0 253793.0
16 E10000009 Dorset Counties 412905.0 201271.0 211634.0
17 E10000011 East Sussex Counties 526671.0 253764.0 272907.0

Change name of countes to match shape files on QGIS

  • Berkshire = "West Berkshire"
  • BRISTOL = "Bristol, City of"
  • Cheshire = "Cheshire East"
  • Cornwall = "Cornwall, Isles of Scilly"
  • Derbyshire = "Derbyshire Dales"
  • Devon = "Mid Devon"
  • Durham = "County Durham"
  • Herefordshire = "Herefordshire, County of"
  • Kingston upon Hull = ""Kingston upon Hull, City of"
  • City of London = "City of London 001"
  • Westminster = "City of London, Westminster"
  • Stockton-on-Trent = "Stoke-on-Trent"
In [644]:
#change one name using key slice
filtered_Gender.loc[filtered_Gender.GEO_CODE == 'E06000037', 'GEO_LABEL'] = "Berkshire"
In [1126]:
#Check if change of name works
Key = filtered_Gender[filtered_Gender['GEO_CODE']=='E06000021']
In [ ]:
 
In [1127]:
Key
Out[1127]:
GEO_CODE GEO_LABEL Sex : Total\ Sex - Unit : Persons Sex : Males - Unit : Persons Sex : Females - Unit : Persons
66 E06000021 Stockton-on-Trent 249008.0 123995.0 125013.0
In [ ]:
 
In [1113]:
#Change name of countes to match shape files on QGIS

filtered_Gender.loc[filtered_Gender.GEO_CODE == 'E06000037', 'GEO_LABEL'] = "Berkshire"
filtered_Gender.loc[filtered_Gender.GEO_CODE == 'E06000023', 'GEO_LABEL'] = "Bristol"
filtered_Gender.loc[filtered_Gender.GEO_CODE == 'E06000049', 'GEO_LABEL'] = "Cheshire"
filtered_Gender.loc[filtered_Gender.GEO_CODE == 'E41000052', 'GEO_LABEL'] = "Cornwall"
filtered_Gender.loc[filtered_Gender.GEO_CODE == 'E07000035', 'GEO_LABEL'] = "Derbyshire"
filtered_Gender.loc[filtered_Gender.GEO_CODE == 'E07000042', 'GEO_LABEL'] = "Devon"
filtered_Gender.loc[filtered_Gender.GEO_CODE == 'E06000047', 'GEO_LABEL'] = "Durham"
filtered_Gender.loc[filtered_Gender.GEO_CODE == 'E06000019', 'GEO_LABEL'] = "Herefordshire"
filtered_Gender.loc[filtered_Gender.GEO_CODE == 'E06000010', 'GEO_LABEL'] = "Kingston upon Hull"
filtered_Gender.loc[filtered_Gender.GEO_CODE == 'E02000001', 'GEO_LABEL'] = "City of London"
filtered_Gender.loc[filtered_Gender.GEO_CODE == 'E41000324', 'GEO_LABEL'] = "Westminster"
filtered_Gender.loc[filtered_Gender.GEO_CODE == 'E06000021', 'GEO_LABEL'] = "Stockton-on-Trent"
filtered_Gender.loc[filtered_Gender.GEO_CODE == 'E06000056', 'GEO_LABEL'] = "Bedfordshire"
In [1003]:
filtered_Gender = filtered_Gender.drop(['GEO_TYPE'], axis=1)
In [1004]:
filtered_Gender.head()
Out[1004]:
GEO_CODE GEO_LABEL Sex : Total\ Sex - Unit : Persons Sex : Males - Unit : Persons Sex : Females - Unit : Persons
11 E10000002 Buckinghamshire 505283.0 248346.0 256937.0
12 E10000003 Cambridgeshire 621210.0 309560.0 311650.0
13 E10000006 Cumbria 499858.0 246065.0 253793.0
16 E10000009 Dorset 412905.0 201271.0 211634.0
17 E10000011 East Sussex 526671.0 253764.0 272907.0
In [960]:
# Transform the data into a numpy ndarray
np_array_gender = filtered_Gender.values
# Removing the first four columns
np_array_gender = np_array_gender[:, 3:]

# Building the PCA model
pca_gender = PCA(n_components=2).fit(np_array_gender) # We're telling PCA here to keep only the 2 first principal components when fitting the data
pca_x_g = pca_gender.transform(np_array_gender) # This variable will contain the coordinates of each row in the new system of coordinates found by PCA (restricted to the first two directions or principal components)
# Those 2 Principal Components are going to be used to project the data onto a 2D space in order to be able 
# to visualize a "compressed" version (in terms of variance) of our initial multivariate DataFrame

# Let's check out the result of our PCA applied to the dataFrame df
print("The pca_x_v variable has a shape of :", pca_x_g.shape)
print(pca_x_g)
The pca_x_v variable has a shape of : (112, 2)
[[ 9.74284592e+04 -5.80259405e+01]
 [ 2.39348529e+05  6.14111376e+03]
 [ 9.07777539e+04  4.77175176e+02]
 [-1.56896731e+04 -2.58793814e+03]
 [ 1.23705850e+05 -7.22328172e+03]
 [ 1.18550185e+06 -5.22448622e+03]
 [ 2.09757342e+05 -1.06783160e+03]
 [ 1.09263061e+06 -2.48740166e+03]
 [ 8.45535014e+05 -9.84121409e+02]
 [ 1.27140991e+06 -3.72371412e+03]
 [ 9.13224579e+05  3.42121172e+02]
 [ 2.75249980e+05  2.60964296e+03]
 [ 3.52690921e+05 -4.14356953e+03]
 [ 5.29323762e+05 -1.53104000e+03]
 [ 3.26041200e+05  2.04373545e+03]
 [ 2.11448861e+05  1.22305127e+02]
 [ 4.41000846e+05  1.15626185e+03]
 [ 2.79303991e+05  2.51468005e+03]
 [ 1.27700884e+05 -2.96144079e+03]
 [ 5.17741967e+05  4.64248715e+03]
 [ 3.70382192e+05  3.02439764e+03]
 [ 8.65541095e+05 -1.61961046e+03]
 [ 1.46638917e+05  1.39482070e+03]
 [ 4.66936016e+05 -8.02682521e+03]
 [ 1.71993493e+05  7.86467516e+02]
 [ 1.17036517e+06 -9.53642507e+03]
 [ 1.12418024e+06  3.42339752e+03]
 [ 8.31787727e+05 -2.40580505e+03]
 [ 2.83014809e+06  6.76674720e+03]
 [ 2.20505149e+06  1.85353861e+03]
 [-4.08719647e+05 -1.48237849e+03]
 [-3.51911332e+05 -1.13956354e+03]
 [-3.55865795e+05 -1.83407271e+03]
 [-2.86757530e+05 -7.56291105e+02]
 [-3.92141982e+05 -1.34617036e+03]
 [-3.67422468e+05 -1.35849140e+03]
 [-2.73768527e+05  6.75926383e+02]
 [-3.40815407e+05  7.91042118e+02]
 [-2.07434983e+05  2.88282402e+03]
 [-1.12126420e+05 -1.58331406e+03]
 [-3.25942107e+05 -9.69476148e+02]
 [-3.16360324e+05 -2.09326244e+02]
 [-2.78852714e+05 -2.14996160e+03]
 [-2.16783890e+05  5.89368981e+02]
 [-1.17468067e+05  7.12621118e+02]
 [-4.75682553e+05 -1.25970579e+02]
 [-1.47102241e+05  4.58213821e+03]
 [-2.96722319e+05 -4.63572268e+02]
 [-3.17351425e+05  2.44085300e+02]
 [-2.16483480e+05  1.75390320e+03]
 [-3.05851585e+05 -1.27306501e+03]
 [ 3.01723366e+03  3.47207185e+03]
 [-2.73322569e+05 -2.15613293e+03]
 [-1.99626098e+05  1.30703983e+03]
 [-2.07434391e+05  4.43156905e+02]
 [-3.61024480e+05 -2.50893859e+03]
 [-2.96722365e+05  1.05963225e+03]
 [-3.40600352e+05 -1.33364311e+03]
 [-2.65297667e+05  1.97935790e+03]
 [-2.96541382e+05  2.17335840e+02]
 [-2.72595562e+05  2.34036593e+03]
 [-3.08741648e+05 -1.10202733e+03]
 [-3.28290930e+05 -2.44286467e+02]
 [-1.98205092e+05  1.07133361e+03]
 [-3.33053326e+05  3.23543667e+02]
 [-2.16699954e+05  6.40524194e+02]
 [-1.86648293e+05  1.99520719e+03]
 [-2.70328905e+05  2.81782120e+03]
 [-2.31358069e+05  3.73715974e+03]
 [-3.52087615e+05 -1.47320757e+03]
 [ 1.07192271e+05 -1.42646981e+03]
 [-1.34352339e+05 -2.10078895e+03]
 [-6.81028461e+04 -1.01991860e+03]
 [-1.46514128e+05  1.20081222e+03]
 [ 5.54110139e+04  2.58291111e+02]
 [-3.28561351e+05 -8.10231750e+02]
 [-4.34342267e+05 -6.74098314e+02]
 [-4.26212751e+05 -1.03208365e+03]
 [ 9.47026002e+04  7.48480155e+03]
 [-2.93721016e+05 -2.24283186e+03]
 [-2.37255416e+05 -3.87401036e+03]
 [-1.40322489e+05  4.55047152e+03]
 [-1.42443824e+05 -5.32904840e+03]
 [-2.51566346e+05 -1.15248929e+03]
 [-7.63392232e+04 -3.67348480e+03]
 [-1.06955386e+05  3.64009175e+03]
 [-1.38687642e+05 -4.53848326e+03]
 [-2.09679096e+05  1.05920614e+03]
 [-2.19827088e+05  8.71822403e+02]
 [-2.97913089e+05 -1.74422935e+03]
 [-2.09224249e+05  8.03401276e+02]
 [-2.28655466e+05  2.07863508e+02]
 [-2.30842936e+05 -3.92113712e+03]
 [-1.85944641e+05  1.18214651e+03]
 [-2.10437409e+05  3.13391585e+03]
 [-2.68982536e+05 -4.63016106e+02]
 [-3.27133232e+05 -3.74063436e+02]
 [-3.25392513e+05 -1.48090765e+03]
 [-1.50254142e+05  2.34166122e+03]
 [-1.83528579e+05 -1.37663603e+03]
 [-2.76864797e+05 -9.09254055e+01]
 [-1.44366916e+05  1.22601535e+04]
 [-1.79775598e+05  8.91133145e+02]
 [-2.92405546e+05 -1.70115518e+03]
 [-1.68368496e+05  8.62197793e+02]
 [-2.88535965e+05 -2.05913889e+03]
 [-2.10322082e+05  7.99971503e+03]
 [-2.05171992e+05  2.38295409e+03]
 [-1.45399407e+05 -3.58533748e+03]
 [ 1.33244574e+05 -5.36116816e+03]
 [-2.43760492e+05  5.35302337e+03]
 [-5.12417972e+05 -2.95953015e+02]]
In [961]:
plt.subplot(1,2,1)
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.title("Projection of the original data onto the first 2 PCs 'Gender' ")
plt.scatter(pca_x_g[:,0], pca_x_g[:,1],cmap='viridis')
Out[961]:
<matplotlib.collections.PathCollection at 0x7f97624b59d0>
In [ ]:
 
In [962]:
# Ordering the features by their coefficients in the equation of the first Principal Component
# PC1 = a*column1 + b*column2 + ... + zzz*column101. The coefficients of PC1 a, b, c, ..zzz also called loadings, are stored in pca_clean.components_[0]
pca_x_1 = pca_gender.components_[0] # First principal component
sort_array_1 = np.argsort(np.abs(pca_x_1)) # Sorting the 'Indexes' of the pca_clean_1 array from the lowest value to the highest (in magnitude, ignoring the sign)
pca_x_1_highest_values = pca_x_1[sort_array_1][::-1][:10] # We sorted from low to high, [::-1] retrieves all values backward (to inverse the sort), [:10] get the first
pca_x_1_cols_highest_values = filtered_Gender.columns[3+sort_array_1][::-1][:10] # retrieving the names of the columns corresponding to the highest values
# The '1+sort_array' accounts for the fact that df has the 'communityname' column and sort_array sorted indices in an array that didn't have the column (we dropped it before doing PCA)

# Same thing but ... a little bit more compact
pca_x_2_highest_values = np.asarray(pca_gender.components_[1])[np.argsort(np.abs(pca_gender.components_[1]))][::-1][:10]
pca_x_2_cols_highest_values = filtered_Gender.columns[3+np.argsort(np.abs(pca_gender.components_[1]))][::-1][:10]

# Printing the result
print('\nFirst Principal Component : 10 Highest Loadings (in magnitude)\n')
for i in range(3):
    print('The Column "{}" has a loading of : {}'.format(pca_x_1_cols_highest_values[i], pca_x_1_highest_values[i]))

print('\nSecond Principal Component : 10 Highest Loadings (in magnitude)\n')
for i in range(3):
    print('The Column "{}" has a loading of : {}'.format(pca_x_2_cols_highest_values[i], pca_x_2_highest_values[i]))
First Principal Component : 10 Highest Loadings (in magnitude)

The Column "Sex : Total\ Sex - Unit : Persons" has a loading of : 0.8164445704535446
The Column "Sex : Females - Unit : Persons" has a loading of : 0.41620335863174274
The Column "Sex : Males - Unit : Persons" has a loading of : 0.40024121182180183

Second Principal Component : 10 Highest Loadings (in magnitude)

The Column "Sex : Males - Unit : Persons" has a loading of : 0.7116696136734258
The Column "Sex : Females - Unit : Persons" has a loading of : -0.702453863915861
The Column "Sex : Total\ Sex - Unit : Persons" has a loading of : 0.00921574975756316
In [655]:
# Change to a bit better style and larger figure.
sns.set_style('whitegrid')
fig, ax = plt.subplots(figsize=(35, 6))

# Plot our violins.
sns.violinplot(x='GEO_LABEL', y= 'Sex : Males - Unit : Persons', data=filtered_Gender)

# Rotate the x-axis labels and remove the plot border on the left.
_ = plt.xticks(rotation=45, ha='right')
sns.despine(left=True)
In [656]:
# Change to a bit better style and larger figure.
sns.set_style('whitegrid')
fig, ax = plt.subplots(figsize=(35, 6))

# Plot our violins.
sns.violinplot(x='GEO_LABEL', y= 'Sex : Females - Unit : Persons', data=filtered_Gender)

# Rotate the x-axis labels and remove the plot border on the left.
_ = plt.xticks(rotation=45, ha='right')
sns.despine(left=True)
In [55]:
filtered_Gender = filtered_Gender.drop(['Sex : Total\ Sex - Unit : Persons'], axis=1)
In [660]:
# Let's extract the numeric columns and turn them into a numpy 2d array (matrix)

gender_num = filtered_Gender.select_dtypes(include='number') # selects numeric columns only
gender_num.head()
Out[660]:
Sex : Total\ Sex - Unit : Persons Sex : Males - Unit : Persons Sex : Females - Unit : Persons
11 505283.0 248346.0 256937.0
12 621210.0 309560.0 311650.0
13 499858.0 246065.0 253793.0
16 412905.0 201271.0 211634.0
17 526671.0 253764.0 272907.0
In [661]:
# Let's compute the distance matrix of pairwise euclidean distances
dist_mat = cdist(gender_num, gender_num) # computes the distances between rows of first dataframe and rows of second dataframe. Here I gave the same dataFrame twice, to get a pairwise distance matrix
print("The shape of this matrix is", dist_mat.shape)
The shape of this matrix is (112, 112)
In [662]:
mds_gender = MDS(n_components=2, random_state=123, dissimilarity='precomputed') # dissimilarity precomputed means that we provide the distance matrix, otherwise MDS() can compute it itself
mds_x_gender = mds_gender.fit_transform(dist_mat)
In [963]:
# Let's visualize the results
plt.figure(num=5, figsize=(25, 12))
plt.scatter(mds_x_gender[:, 0], mds_x_gender[:, 1], alpha = .3, s=100)

# We need to loop over every borough, get its name from the borough dataframe and its location from mds_x and add the text to our scatterplot
for index,name in enumerate(filtered_Gender['GEO_LABEL']):
    plt.annotate(s=name, xy=(mds_x_gender[index, 0], mds_x_gender[index, 1]))

plt.xlabel('Axis 1')
plt.ylabel('Axis 2')
plt.title('Multidimensional Scaling on Districts and subdivisions (All rows)');
In [ ]:
 

Vehicle Accessibility per household in England Data

In [665]:
df_vehi=pd.read_csv("Vehicle accessibility per household in England Data.csv")
In [666]:
df_vehi = df_vehi.iloc[1: , :]
In [667]:
df_vehi
Out[667]:
CDU_ID GEO_CODE GEO_LABEL GEO_TYPE GEO_TYP2 Car or van availability : Total\ Car or van availability - Unit : Households Car or van availability : Total\ Car or van availability - Unit : Cars or vans Car or van availability : No cars or vans in household - Unit : Households Car or van availability : 1 car or van in household - Unit : Households Car or van availability : 2 cars or vans in household - Unit : Households Car or van availability : 3 cars or vans in household - Unit : Households Car or van availability : 4 or more cars or vans in household - Unit : Households Car or van availability : Sum of all cars or vans - Unit : Cars or vans
1 4.0 E92000001 England Countries and Groupings CTRY 22063368.0 25696833.0 5691251.0 9301776.0 5441593.0 1203865.0 424883.0 25696832.0
2 8.0 E12000001 North East Regions RGN 1129935.0 1150133.0 355929.0 476508.0 238160.0 46036.0 13302.0 1150133.0
3 9.0 E12000002 North West Regions RGN 3009549.0 3296604.0 841667.0 1279984.0 707398.0 138371.0 42129.0 3296604.0
4 10.0 E12000003 Yorkshire and The Humber Regions RGN 2224059.0 2451298.0 612903.0 954222.0 521858.0 102611.0 32465.0 2451298.0
5 11.0 E12000004 East Midlands Regions RGN 1895604.0 2356539.0 418999.0 805212.0 519487.0 113338.0 38568.0 2356539.0
... ... ... ... ... ... ... ... ... ... ... ... ... ...
219050 231883.0 E00176770 E00176770 Output Areas and Small Areas OASA 57.0 57.0 15.0 29.0 11.0 2.0 0.0 57.0
219051 231884.0 E00176771 E00176771 Output Areas and Small Areas OASA 44.0 73.0 7.0 11.0 22.0 2.0 2.0 73.0
219052 231885.0 E00176772 E00176772 Output Areas and Small Areas OASA 98.0 170.0 7.0 33.0 45.0 8.0 5.0 170.0
219053 231886.0 E00176773 E00176773 Output Areas and Small Areas OASA 107.0 85.0 35.0 59.0 13.0 0.0 0.0 85.0
219054 231887.0 E00176774 E00176774 Output Areas and Small Areas OASA 53.0 25.0 34.0 13.0 6.0 0.0 0.0 25.0

219054 rows × 13 columns

In [668]:
GeoCodes=["E09000002","E06000022","E06000055","E06000037","E09000004","E06000008","E06000028","E09000005","E06000043","E06000023","E09000006","E10000002","E10000003","E09000007","E06000049","E41000052","E09000008","E10000006","E06000005","E07000035","E06000015","E07000042","E10000009","E06000047","E09000009","E06000011","E10000011","E09000010","E10000012","E10000013","E09000011","E09000012","E06000006","E09000013","E10000014","E09000014","E09000015","E06000001","E09000016","E06000019","E10000015","E09000017","E09000018","E06000046","E09000019","E09000020","E10000016","E06000010","E09000021","E09000022","E10000017","E06000016","E10000018","E09000023","E10000019","E02000001","E06000032","E08000003","E06000035","E11000002","E09000024","E06000002","E06000042","E09000025","E10000020","E06000012","E06000013","E06000024","E10000023","E10000021","E06000048","E06000018","E10000024","E10000025","E06000031","E06000026","E06000029","E06000044","E09000026","E06000003","E09000027","E06000017","E06000051","E10000027","E06000025","E11000003","E06000045","E06000033","E09000028","E10000028","E06000021","E06000004","E10000029","E10000030","E09000029","E06000030","E06000020","E06000034","E06000027","E09000030","E11000004","E09000031","E09000032","E06000007","E10000031","E11000005","E10000032","E11000006","E41000324","E06000054","E10000034","E06000014"]
In [669]:
filtered_vehi = df_vehi[df_vehi['GEO_CODE'].isin(GeoCodes)]
filtered_vehi.head()
Out[669]:
CDU_ID GEO_CODE GEO_LABEL GEO_TYPE GEO_TYP2 Car or van availability : Total\ Car or van availability - Unit : Households Car or van availability : Total\ Car or van availability - Unit : Cars or vans Car or van availability : No cars or vans in household - Unit : Households Car or van availability : 1 car or van in household - Unit : Households Car or van availability : 2 cars or vans in household - Unit : Households Car or van availability : 3 cars or vans in household - Unit : Households Car or van availability : 4 or more cars or vans in household - Unit : Households Car or van availability : Sum of all cars or vans - Unit : Cars or vans
11 17.0 E10000002 Buckinghamshire Counties CNTY 200727.0 314138.0 25261.0 75300.0 73419.0 18882.0 7865.0 314138.0
12 18.0 E10000003 Cambridgeshire Counties CNTY 251241.0 343690.0 43588.0 106212.0 76970.0 17830.0 6641.0 343690.0
13 19.0 E10000006 Cumbria Counties CNTY 222042.0 273534.0 47578.0 99389.0 57798.0 12825.0 4452.0 273534.0
16 22.0 E10000009 Dorset Counties CNTY 180213.0 253649.0 28021.0 78377.0 54354.0 14024.0 5437.0 253649.0
17 23.0 E10000011 East Sussex Counties CNTY 231905.0 292118.0 50674.0 100340.0 60173.0 14750.0 5968.0 292118.0
In [670]:
filtered_vehi = filtered_vehi.drop('CDU_ID', 1)
#filtered_vehi = filtered_vehi.drop('CDU_ID', 1)
In [671]:
filtered_vehi.corr()
Out[671]:
Car or van availability : Total\ Car or van availability - Unit : Households Car or van availability : Total\ Car or van availability - Unit : Cars or vans Car or van availability : No cars or vans in household - Unit : Households Car or van availability : 1 car or van in household - Unit : Households Car or van availability : 2 cars or vans in household - Unit : Households Car or van availability : 3 cars or vans in household - Unit : Households Car or van availability : 4 or more cars or vans in household - Unit : Households Car or van availability : Sum of all cars or vans - Unit : Cars or vans
Car or van availability : Total\ Car or van availability - Unit : Households 1.000000 0.970569 0.894558 0.998370 0.953485 0.910057 0.850146 0.970569
Car or van availability : Total\ Car or van availability - Unit : Cars or vans 0.970569 1.000000 0.761521 0.972435 0.997572 0.981874 0.947556 1.000000
Car or van availability : No cars or vans in household - Unit : Households 0.894558 0.761521 1.000000 0.883632 0.720240 0.636016 0.541815 0.761521
Car or van availability : 1 car or van in household - Unit : Households 0.998370 0.972435 0.883632 1.000000 0.955715 0.911658 0.851096 0.972435
Car or van availability : 2 cars or vans in household - Unit : Households 0.953485 0.997572 0.720240 0.955715 1.000000 0.989693 0.960521 0.997572
Car or van availability : 3 cars or vans in household - Unit : Households 0.910057 0.981874 0.636016 0.911658 0.989693 1.000000 0.989454 0.981874
Car or van availability : 4 or more cars or vans in household - Unit : Households 0.850146 0.947556 0.541815 0.851096 0.960521 0.989454 1.000000 0.947556
Car or van availability : Sum of all cars or vans - Unit : Cars or vans 0.970569 1.000000 0.761521 0.972435 0.997572 0.981874 0.947556 1.000000
In [672]:
Car or van availability : Total\ Car or van availability - Unit : Households
        Car or van availability : Total\ Car or van availability - Unit : Cars or vans
                Car or van availability : No cars or vans in household - Unit : Households
                        Car or van availability : 1 car or van in household - Unit : Households
                                Car or van availability : 2 cars or vans in household - Unit : Households
                                        Car or van availability : 3 cars or vans in household - Unit : Households
                                                Car or van availability : 4 or more cars or vans in household - Unit : Households
                                                        Car or van availability : Sum of all cars or vans - Unit : Cars or vans
  File "<ipython-input-672-bca3a112b3a3>", line 1
    Car or van availability : Total\ Car or van availability - Unit : Households
               ^
SyntaxError: invalid syntax
In [673]:
filtered_vehi.rename(columns = {'Car or van availability : Total\ Car or van availability - Unit : Households':'Total Number of households which have access to a Vehicle ',
                                'Car or van availability : Total\ Car or van availability - Unit : Cars or vans':'Total number of cars which are privately accessible to citizens(Owned/leased)',
                                'Car or van availability : No cars or vans in household - Unit : Households':'Total Number of households which do not have access to vehicles',
                                'Car or van availability : 1 car or van in household - Unit : Households':'Number of households which have access to 1 vehicle',
                                'Car or van availability : 2 cars or vans in household - Unit : Households':'Number of households which have access to 2 vehicles',
                                'Car or van availability : 3 cars or vans in household - Unit : Households':'Number of households which have accessibilty to 3 vehicles',
                                'Car or van availability : 4 or more cars or vans in household - Unit : Households':'Number of households which have access to 4 or more vehicles'}, inplace = True)
In [1005]:
filtered_vehi.head()
Out[1005]:
GEO_CODE GEO_LABEL GEO_TYPE GEO_TYP2 Total Number of households which have access to a Vehicle Total number of cars which are privately accessible to citizens(Owned/leased) Total Number of households which do not have access to vehicles Number of households which have access to 1 vehicle Number of households which have access to 2 vehicles Number of households which have accessibilty to 3 vehicles Number of households which have access to 4 or more vehicles
11 E10000002 Buckinghamshire Counties CNTY 200727.0 314138.0 25261.0 75300.0 73419.0 18882.0 7865.0
12 E10000003 Cambridgeshire Counties CNTY 251241.0 343690.0 43588.0 106212.0 76970.0 17830.0 6641.0
13 E10000006 Cumbria Counties CNTY 222042.0 273534.0 47578.0 99389.0 57798.0 12825.0 4452.0
16 E10000009 Dorset Counties CNTY 180213.0 253649.0 28021.0 78377.0 54354.0 14024.0 5437.0
17 E10000011 East Sussex Counties CNTY 231905.0 292118.0 50674.0 100340.0 60173.0 14750.0 5968.0
In [1006]:
filtered_vehi = filtered_vehi.drop(['GEO_TYPE'], axis=1)
In [1007]:
filtered_vehi = filtered_vehi.drop(['GEO_TYP2'], axis=1)
In [1008]:
filtered_vehi.head()
Out[1008]:
GEO_CODE GEO_LABEL Total Number of households which have access to a Vehicle Total number of cars which are privately accessible to citizens(Owned/leased) Total Number of households which do not have access to vehicles Number of households which have access to 1 vehicle Number of households which have access to 2 vehicles Number of households which have accessibilty to 3 vehicles Number of households which have access to 4 or more vehicles
11 E10000002 Buckinghamshire 200727.0 314138.0 25261.0 75300.0 73419.0 18882.0 7865.0
12 E10000003 Cambridgeshire 251241.0 343690.0 43588.0 106212.0 76970.0 17830.0 6641.0
13 E10000006 Cumbria 222042.0 273534.0 47578.0 99389.0 57798.0 12825.0 4452.0
16 E10000009 Dorset 180213.0 253649.0 28021.0 78377.0 54354.0 14024.0 5437.0
17 E10000011 East Sussex 231905.0 292118.0 50674.0 100340.0 60173.0 14750.0 5968.0
In [ ]:
 
In [675]:
filtered_vehi = filtered_vehi.drop('Car or van availability : Sum of all cars or vans - Unit : Cars or vans', 1)
In [676]:
filtered_vehi
Out[676]:
GEO_CODE GEO_LABEL GEO_TYPE GEO_TYP2 Total Number of households which have access to a Vehicle Total number of cars which are privately accessible to citizens(Owned/leased) Total Number of households which do not have access to vehicles Number of households which have access to 1 vehicle Number of households which have access to 2 vehicles Number of households which have accessibilty to 3 vehicles Number of households which have access to 4 or more vehicles
11 E10000002 Buckinghamshire Counties CNTY 200727.0 314138.0 25261.0 75300.0 73419.0 18882.0 7865.0
12 E10000003 Cambridgeshire Counties CNTY 251241.0 343690.0 43588.0 106212.0 76970.0 17830.0 6641.0
13 E10000006 Cumbria Counties CNTY 222042.0 273534.0 47578.0 99389.0 57798.0 12825.0 4452.0
16 E10000009 Dorset Counties CNTY 180213.0 253649.0 28021.0 78377.0 54354.0 14024.0 5437.0
17 E10000011 East Sussex Counties CNTY 231905.0 292118.0 50674.0 100340.0 60173.0 14750.0 5968.0
... ... ... ... ... ... ... ... ... ... ... ...
366 E09000031 Waltham Forest Local Authorities LA 96861.0 76217.0 40583.0 40732.0 12225.0 2582.0 739.0
367 E09000032 Wandsworth Local Authorities LA 130493.0 89513.0 59143.0 56409.0 12634.0 1798.0 509.0
368 E41000052 Cornwall, Isles of Scilly Local Authorities LA 231378.0 311157.0 40429.0 103219.0 65012.0 16305.0 6413.0
369 E41000324 City of London, Westminster Local Authorities LA 110157.0 50502.0 69574.0 32969.0 6103.0 1051.0 460.0
8048 E02000001 City of London 001 Middle Super Output Areas and Intermediate Zones MSOAIZ 4385.0 1692.0 3043.0 1100.0 173.0 51.0 18.0

112 rows × 11 columns

In [71]:
filtered_vehi.corr()
Out[71]:
Total Number of households which have access to a Vehicle Total number of cars which are privately accessible to citizens(Owned/leased) Total Number of households which do not have access to vehicles Number of households which have access to 1 vehicle Number of households which have access to 2 vehicles Number of households which have accessibilty to 3 vehicles Number of households which have access to 4 or more vehicles
Total Number of households which have access to a Vehicle 1.000000 0.971847 0.916214 0.998577 0.954819 0.908653 0.841184
Total number of cars which are privately accessible to citizens(Owned/leased) 0.971847 1.000000 0.796828 0.973953 0.997508 0.980139 0.940253
Total Number of households which do not have access to vehicles 0.916214 0.796828 1.000000 0.906429 0.757433 0.672013 0.570064
Number of households which have access to 1 vehicle 0.998577 0.973953 0.906429 1.000000 0.957475 0.910785 0.842811
Number of households which have access to 2 vehicles 0.954819 0.997508 0.757433 0.957475 1.000000 0.988330 0.954229
Number of households which have accessibilty to 3 vehicles 0.908653 0.980139 0.672013 0.910785 0.988330 1.000000 0.987454
Number of households which have access to 4 or more vehicles 0.841184 0.940253 0.570064 0.842811 0.954229 0.987454 1.000000
In [964]:
filtered_vehi.loc[filtered_vehi.GEO_CODE == 'E06000037', 'GEO_LABEL'] = "Berkshire"
filtered_vehi.loc[filtered_vehi.GEO_CODE == 'E06000023', 'GEO_LABEL'] = "Bristol"
filtered_vehi.loc[filtered_vehi.GEO_CODE == 'E06000049', 'GEO_LABEL'] = "Cheshire"
filtered_vehi.loc[filtered_vehi.GEO_CODE == 'E41000052', 'GEO_LABEL'] = "Cornwall"
filtered_vehi.loc[filtered_vehi.GEO_CODE == 'E07000035', 'GEO_LABEL'] = "Derbyshire"
filtered_vehi.loc[filtered_vehi.GEO_CODE == 'E07000042', 'GEO_LABEL'] = "Devon"
filtered_vehi.loc[filtered_vehi.GEO_CODE == 'E06000047', 'GEO_LABEL'] = "Durham"
filtered_vehi.loc[filtered_vehi.GEO_CODE == 'E06000019', 'GEO_LABEL'] = "Herefordshire"
filtered_vehi.loc[filtered_vehi.GEO_CODE == 'E06000010', 'GEO_LABEL'] = "Kingston upon Hull"
filtered_vehi.loc[filtered_vehi.GEO_CODE == 'E02000001', 'GEO_LABEL'] = "City of London"
filtered_vehi.loc[filtered_vehi.GEO_CODE == 'E41000324', 'GEO_LABEL'] = "Westminster"
filtered_vehi.loc[filtered_vehi.GEO_CODE == 'E06000021', 'GEO_LABEL'] = "Stockton-on-Trent"
In [ ]:
 
In [677]:
# Transform the data into a numpy ndarray
np_array_vehi = filtered_vehi.values
In [678]:
# Removing the first four columns
np_array_vehi = np_array_vehi[:, 4:]
In [679]:
np_array_vehi
Out[679]:
array([[200727.0, 314138.0, 25261.0, 75300.0, 73419.0, 18882.0, 7865.0],
       [251241.0, 343690.0, 43588.0, 106212.0, 76970.0, 17830.0, 6641.0],
       [222042.0, 273534.0, 47578.0, 99389.0, 57798.0, 12825.0, 4452.0],
       [180213.0, 253649.0, 28021.0, 78377.0, 54354.0, 14024.0, 5437.0],
       [231905.0, 292118.0, 50674.0, 100340.0, 60173.0, 14750.0, 5968.0],
       [581589.0, 795400.0, 104522.0, 244783.0, 171954.0, 42913.0,
        17417.0],
       [254615.0, 353020.0, 43480.0, 106491.0, 78426.0, 18960.0, 7258.0],
       [545254.0, 796638.0, 80005.0, 220861.0, 182611.0, 44568.0,
        17209.0],
       [453817.0, 626113.0, 76823.0, 191349.0, 140172.0, 32826.0,
        12647.0],
       [605638.0, 790956.0, 121094.0, 258442.0, 169541.0, 40616.0,
        15945.0],
       [496299.0, 599618.0, 113467.0, 215777.0, 130414.0, 27546.0,
        9095.0],
       [267434.0, 380884.0, 40821.0, 110471.0, 88377.0, 20566.0, 7199.0],
       [306971.0, 402731.0, 55407.0, 137635.0, 87271.0, 19598.0, 7060.0],
       [372085.0, 484255.0, 70096.0, 166827.0, 102064.0, 23973.0, 9125.0],
       [287730.0, 386622.0, 54332.0, 118068.0, 87951.0, 20204.0, 7175.0],
       [256594.0, 345907.0, 45716.0, 110448.0, 76275.0, 17670.0, 6485.0],
       [334303.0, 419187.0, 69687.0, 144941.0, 93973.0, 19461.0, 6241.0],
       [258855.0, 357634.0, 45340.0, 107358.0, 79519.0, 19148.0, 7490.0],
       [226989.0, 318697.0, 36092.0, 98217.0, 68317.0, 17352.0, 7011.0],
       [355263.0, 485343.0, 63893.0, 146504.0, 109218.0, 26358.0, 9290.0],
       [310745.0, 416500.0, 55661.0, 135039.0, 90758.0, 21254.0, 8033.0],
       [455791.0, 687072.0, 59865.0, 184249.0, 155920.0, 39607.0,
        16150.0],
       [231005.0, 319093.0, 40619.0, 94540.0, 72338.0, 17087.0, 6421.0],
       [345614.0, 464829.0, 61657.0, 150046.0, 100934.0, 23709.0, 9268.0],
       [239717.0, 341207.0, 39855.0, 96498.0, 76841.0, 18970.0, 7553.0],
       [602087.0, 569170.0, 211215.0, 247753.0, 116033.0, 21323.0,
        5763.0],
       [565442.0, 597538.0, 166934.0, 241096.0, 125950.0, 24370.0,
        7092.0],
       [484527.0, 436319.0, 178193.0, 201234.0, 86052.0, 15094.0, 3954.0],
       [1086748.0, 1123244.0, 341972.0, 451747.0, 228970.0, 48790.0,
        15269.0],
       [922452.0, 976961.0, 271399.0, 392213.0, 208849.0, 38218.0,
        11773.0],
       [40434.0, 38269.0, 14268.0, 16573.0, 7662.0, 1535.0, 396.0],
       [57203.0, 51821.0, 21488.0, 22963.0, 10207.0, 1945.0, 600.0],
       [59605.0, 64747.0, 16935.0, 25367.0, 13646.0, 2832.0, 825.0],
       [79159.0, 91623.0, 20524.0, 32728.0, 20487.0, 4222.0, 1198.0],
       [46670.0, 49794.0, 13052.0, 20682.0, 10450.0, 1962.0, 524.0],
       [53312.0, 59007.0, 14380.0, 22923.0, 12893.0, 2418.0, 698.0],
       [85140.0, 110349.0, 16409.0, 35587.0, 26623.0, 5049.0, 1472.0],
       [57353.0, 58447.0, 17502.0, 25110.0, 11819.0, 2256.0, 666.0],
       [112596.0, 90240.0, 45748.0, 47612.0, 16002.0, 2588.0, 646.0],
       [143032.0, 190419.0, 25200.0, 62802.0, 42328.0, 9463.0, 3239.0],
       [69707.0, 70625.0, 21481.0, 30338.0, 14469.0, 2638.0, 781.0],
       [70684.0, 89482.0, 14617.0, 30714.0, 19450.0, 4454.0, 1449.0],
       [83552.0, 90126.0, 21808.0, 38999.0, 18535.0, 3210.0, 1000.0],
       [102271.0, 108124.0, 29593.0, 44344.0, 22930.0, 4220.0, 1184.0],
       [123125.0, 110395.0, 45375.0, 51864.0, 20787.0, 3975.0, 1124.0],
       [15002.0, 22542.0, 1859.0, 6152.0, 5222.0, 1328.0, 441.0],
       [126131.0, 96284.0, 55093.0, 50614.0, 16785.0, 2830.0, 809.0],
       [78319.0, 111829.0, 12826.0, 32556.0, 23805.0, 6395.0, 2737.0],
       [66608.0, 84671.0, 13748.0, 28700.0, 18576.0, 4178.0, 1406.0],
       [107575.0, 107833.0, 33225.0, 47582.0, 21535.0, 4132.0, 1101.0],
       [73515.0, 92628.0, 16158.0, 31073.0, 19957.0, 4511.0, 1816.0],
       [182747.0, 190530.0, 52814.0, 82483.0, 37858.0, 7172.0, 2420.0],
       [88227.0, 121381.0, 15151.0, 36867.0, 27620.0, 6248.0, 2341.0],
       [107538.0, 158289.0, 14044.0, 45398.0, 36103.0, 8618.0, 3375.0],
       [109307.0, 115466.0, 30438.0, 49904.0, 23251.0, 4425.0, 1289.0],
       [59010.0, 66852.0, 15063.0, 26678.0, 13235.0, 2926.0, 1108.0],
       [82374.0, 92927.0, 21333.0, 36807.0, 18763.0, 3961.0, 1510.0],
       [63530.0, 84638.0, 11245.0, 28006.0, 18431.0, 4322.0, 1526.0],
       [88360.0, 107747.0, 19060.0, 39031.0, 24098.0, 4722.0, 1449.0],
       [74023.0, 83910.0, 18412.0, 33372.0, 17712.0, 3452.0, 1075.0],
       [74293.0, 80830.0, 20389.0, 32923.0, 16499.0, 3440.0, 1042.0],
       [74678.0, 81304.0, 20383.0, 33268.0, 16574.0, 3406.0, 1047.0],
       [62353.0, 79764.0, 12527.0, 27384.0, 17007.0, 3973.0, 1462.0],
       [106209.0, 133257.0, 23231.0, 45163.0, 28790.0, 6609.0, 2416.0],
       [62340.0, 96617.0, 7759.0, 24042.0, 22437.0, 5858.0, 2244.0],
       [98584.0, 128029.0, 18656.0, 42633.0, 29364.0, 6009.0, 1922.0],
       [121540.0, 104397.0, 46415.0, 51727.0, 19078.0, 3294.0, 1026.0],
       [85473.0, 81389.0, 28533.0, 37396.0, 15776.0, 2948.0, 820.0],
       [98254.0, 101546.0, 28996.0, 43938.0, 20099.0, 3969.0, 1252.0],
       [61085.0, 72788.0, 13761.0, 28314.0, 14356.0, 3468.0, 1186.0],
       [223803.0, 248586.0, 60926.0, 96086.0, 52740.0, 10775.0, 3276.0],
       [138534.0, 168974.0, 30543.0, 60875.0, 36916.0, 7671.0, 2529.0],
       [159441.0, 224179.0, 25709.0, 65543.0, 52217.0, 11763.0, 4209.0],
       [129674.0, 184779.0, 20483.0, 54671.0, 39927.0, 10291.0, 4302.0],
       [194194.0, 280840.0, 28660.0, 80315.0, 63879.0, 15513.0, 5827.0],
       [63812.0, 83041.0, 13137.0, 26678.0, 18037.0, 4323.0, 1637.0],
       [30744.0, 45168.0, 4557.0, 12383.0, 10233.0, 2506.0, 1065.0],
       [32758.0, 48250.0, 4696.0, 13717.0, 10443.0, 2686.0, 1216.0],
       [204969.0, 150722.0, 91248.0, 83276.0, 25598.0, 3779.0, 1068.0],
       [69681.0, 56966.0, 27618.0, 30183.0, 9557.0, 1859.0, 464.0],
       [92604.0, 108507.0, 21918.0, 42054.0, 21708.0, 5204.0, 1720.0],
       [110286.0, 87802.0, 47419.0, 43598.0, 14884.0, 3429.0, 956.0],
       [130862.0, 153908.0, 30723.0, 59613.0, 30909.0, 7072.0, 2545.0],
       [97534.0, 46601.0, 59595.0, 30991.0, 5757.0, 912.0, 279.0],
       [145010.0, 140049.0, 48523.0, 63183.0, 25836.0, 5571.0, 1897.0],
       [124082.0, 112845.0, 43847.0, 54846.0, 20017.0, 4093.0, 1279.0],
       [119916.0, 119653.0, 38933.0, 51872.0, 21997.0, 5328.0, 1786.0],
       [101045.0, 78185.0, 42455.0, 43058.0, 12523.0, 2321.0, 688.0],
       [101690.0, 41800.0, 65721.0, 31157.0, 4129.0, 525.0, 158.0],
       [80590.0, 43843.0, 44524.0, 29496.0, 5661.0, 730.0, 179.0],
       [101955.0, 61515.0, 52851.0, 38911.0, 8507.0, 1333.0, 353.0],
       [84268.0, 100326.0, 19811.0, 37013.0, 20993.0, 5012.0, 1439.0],
       [97199.0, 117634.0, 22364.0, 42968.0, 23831.0, 5910.0, 2126.0],
       [100214.0, 122486.0, 22716.0, 43942.0, 25193.0, 6125.0, 2238.0],
       [94902.0, 94042.0, 29985.0, 42744.0, 16986.0, 3942.0, 1245.0],
       [93556.0, 38629.0, 60485.0, 28507.0, 3914.0, 490.0, 160.0],
       [78536.0, 44536.0, 43982.0, 26866.0, 6204.0, 1046.0, 438.0],
       [63639.0, 70421.0, 15997.0, 30084.0, 13740.0, 2861.0, 957.0],
       [130017.0, 66791.0, 75214.0, 45077.0, 8147.0, 1219.0, 360.0],
       [116091.0, 76507.0, 55893.0, 46991.0, 10829.0, 1875.0, 503.0],
       [78757.0, 72777.0, 25644.0, 37557.0, 12462.0, 2377.0, 717.0],
       [101519.0, 61092.0, 52849.0, 38531.0, 8405.0, 1386.0, 348.0],
       [99105.0, 106339.0, 27697.0, 44548.0, 20684.0, 4762.0, 1414.0],
       [79835.0, 84918.0, 19751.0, 40079.0, 16289.0, 2926.0, 790.0],
       [120422.0, 60438.0, 70312.0, 41622.0, 7173.0, 1033.0, 282.0],
       [78174.0, 91266.0, 18303.0, 36110.0, 18214.0, 4118.0, 1429.0],
       [101257.0, 43589.0, 63797.0, 32329.0, 4450.0, 521.0, 160.0],
       [96861.0, 76217.0, 40583.0, 40732.0, 12225.0, 2582.0, 739.0],
       [130493.0, 89513.0, 59143.0, 56409.0, 12634.0, 1798.0, 509.0],
       [231378.0, 311157.0, 40429.0, 103219.0, 65012.0, 16305.0, 6413.0],
       [110157.0, 50502.0, 69574.0, 32969.0, 6103.0, 1051.0, 460.0],
       [4385.0, 1692.0, 3043.0, 1100.0, 173.0, 51.0, 18.0]], dtype=object)
In [680]:
# Building the PCA model
pca_vehi = PCA(n_components=2).fit(np_array_vehi) # We're telling PCA here to keep only the 2 first principal components when fitting the data
pca_x_v = pca_vehi.transform(np_array_vehi) # This variable will contain the coordinates of each row in the new system of coordinates found by PCA (restricted to the first two directions or principal components)
# Those 2 Principal Components are going to be used to project the data onto a 2D space in order to be able 
# to visualize a "compressed" version (in terms of variance) of our initial multivariate DataFrame

# Let's check out the result of our PCA applied to the dataFrame df
print("The pca_x_v variable has a shape of :", pca_x_v.shape)
print(pca_x_v)
The pca_x_v variable has a shape of : (112, 2)
[[ 9.61750740e+04 -6.66812416e+04]
 [ 1.58773896e+05 -4.09903419e+04]
 [ 8.56503328e+04 -1.21494495e+04]
 [ 3.76430471e+04 -3.76677410e+04]
 [ 1.06255428e+05 -1.59290108e+04]
 [ 7.45621930e+05 -7.85747919e+04]
 [ 1.67941626e+05 -4.47861852e+04]
 [ 7.17154365e+05 -1.19054245e+05]
 [ 5.22839982e+05 -7.03582621e+04]
 [ 7.61990439e+05 -5.07090109e+04]
 [ 5.38298247e+05 -5.24276926e+03]
 [ 1.98230783e+05 -5.64096590e+04]
 [ 2.46457230e+05 -3.41959232e+04]
 [ 3.56839049e+05 -3.47384655e+04]
 [ 2.18251044e+05 -3.95725193e+04]
 [ 1.64834254e+05 -3.72712049e+04]
 [ 2.79676622e+05 -2.05748256e+04]
 [ 1.74505319e+05 -4.41065125e+04]
 [ 1.21585462e+05 -4.39643199e+04]
 [ 3.42835816e+05 -5.29018811e+04]
 [ 2.58780456e+05 -4.09680742e+04]
 [ 5.67214849e+05 -1.17928764e+05]
 [ 1.24598349e+05 -4.07968466e+04]
 [ 3.21252247e+05 -4.55027492e+04]
 [ 1.47145561e+05 -4.96252366e+04]
 [ 5.98287514e+05  1.33444925e+05]
 [ 5.91015185e+05  6.91505815e+04]
 [ 4.09693439e+05  1.24193081e+05]
 [ 1.38117640e+06  1.68885795e+05]
 [ 1.14776172e+06  1.16695209e+05]
 [-2.28624173e+05 -4.06262830e+02]
 [-2.05674089e+05  5.70699146e+03]
 [-1.94239159e+05 -3.18797677e+03]
 [-1.59428461e+05 -5.86616249e+03]
 [-2.15152198e+05 -4.09471499e+03]
 [-2.03297741e+05 -5.05553068e+03]
 [-1.41014960e+05 -1.62789459e+04]
 [-2.00486894e+05 -1.90822310e+02]
 [-1.34001875e+05  3.08494288e+04]
 [-3.71821526e+04 -2.37842061e+04]
 [-1.81905193e+05  2.24598334e+03]
 [-1.67534456e+05 -1.27704393e+04]
 [-1.56491017e+05 -4.32785487e+02]
 [-1.28996822e+05  3.99129713e+03]
 [-1.11150538e+05  2.46866025e+04]
 [-2.60044902e+05 -1.36511325e+04]
 [-1.19314381e+05  4.04407142e+04]
 [-1.45651379e+05 -2.28981714e+04]
 [-1.74261246e+05 -1.29136236e+04]
 [-1.24948901e+05  9.90911001e+03]
 [-1.63153233e+05 -1.21737038e+04]
 [-5.39973249e+03  1.76860671e+04]
 [-1.30749778e+05 -2.15928849e+04]
 [-8.88059991e+04 -3.29230230e+04]
 [-1.17853145e+05  4.99654972e+03]
 [-1.93044404e+05 -5.47461632e+03]
 [-1.55695942e+05 -3.37602103e+03]
 [-1.76662000e+05 -1.60876392e+04]
 [-1.40172079e+05 -1.04800984e+04]
 [-1.68733233e+05 -4.82112807e+03]
 [-1.70855197e+05 -1.65318948e+03]
 [-1.70182088e+05 -1.66933247e+03]
 [-1.81144992e+05 -1.30575197e+04]
 [-1.07923013e+05 -1.28061722e+04]
 [-1.69399660e+05 -2.69911235e+04]
 [-1.17505096e+05 -1.70890571e+04]
 [-1.16667577e+05  2.81480547e+04]
 [-1.61649184e+05  9.64649220e+03]
 [-1.36846073e+05  5.66947248e+03]
 [-1.87037757e+05 -8.41362057e+03]
 [ 6.85995707e+04  1.10274770e+04]
 [-5.62165804e+04 -1.02616870e+04]
 [-2.84775564e+02 -3.52105381e+04]
 [-5.23213101e+04 -3.14786442e+04]
 [ 6.80118862e+04 -4.66448270e+04]
 [-1.77793566e+05 -1.41007579e+04]
 [-2.31319913e+05 -1.63328401e+04]
 [-2.27468128e+05 -1.67281526e+04]
 [-1.77734188e+04  7.67777888e+04]
 [-1.91902517e+05  1.43943769e+04]
 [-1.36284214e+05 -6.05291101e+03]
 [-1.38085611e+05  3.14187496e+04]
 [-7.30771042e+04 -4.85027244e+03]
 [-1.78842598e+05  5.44895534e+04]
 [-7.22551698e+04  2.24388753e+04]
 [-1.08366347e+05  2.35860528e+04]
 [-1.06944628e+05  1.37368093e+04]
 [-1.51875044e+05  2.94331613e+04]
 [-1.79251346e+05  6.33046212e+04]
 [-1.93467699e+05  3.80466312e+04]
 [-1.63792726e+05  4.53929921e+04]
 [-1.48926287e+05 -7.84546398e+03]
 [-1.26210560e+05 -8.75142183e+03]
 [-1.20344635e+05 -9.76389274e+03]
 [-1.44991017e+05  9.04582995e+03]
 [-1.87855287e+05  5.73013366e+04]
 [-1.94824287e+05  3.56753195e+04]
 [-1.86608805e+05 -3.97164806e+03]
 [-1.38584394e+05  7.14820662e+04]
 [-1.41548607e+05  4.72005007e+04]
 [-1.72862318e+05  9.90919941e+03]
 [-1.64472559e+05  4.53476350e+04]
 [-1.32738812e+05  2.62714429e+03]
 [-1.62900184e+05 -5.81988499e+01]
 [-1.50669520e+05  6.66914841e+04]
 [-1.60096690e+05 -6.46351905e+03]
 [-1.78123629e+05  6.10971454e+04]
 [-1.56694658e+05  2.68615210e+04]
 [-1.20337240e+05  5.07524549e+04]
 [ 1.19987157e+05 -3.33361268e+04]
 [-1.66508487e+05  6.50757594e+04]
 [-2.83582895e+05 -6.77000281e+03]]
In [681]:
plt.subplot(1,2,1)
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.title("Projection of the original data onto the first 2 PCs")
plt.scatter(pca_x_v[:,0], pca_x_v[:,1],cmap='viridis')
Out[681]:
<matplotlib.collections.PathCollection at 0x7f97b5bd63d0>
In [682]:
# Ordering the features by their coefficients in the equation of the first Principal Component
# PC1 = a*column1 + b*column2 + ... + zzz*column101. The coefficients of PC1 a, b, c, ..zzz also called loadings, are stored in pca_clean.components_[0]
pca_x_1 = pca_vehi.components_[0] # First principal component
sort_array_1 = np.argsort(np.abs(pca_x_1)) # Sorting the 'Indexes' of the pca_clean_1 array from the lowest value to the highest (in magnitude, ignoring the sign)
pca_x_1_highest_values = pca_x_1[sort_array_1][::-1][:10] # We sorted from low to high, [::-1] retrieves all values backward (to inverse the sort), [:10] get the first
pca_x_1_cols_highest_values = filtered_vehi.columns[4+sort_array_1][::-1][:10] # retrieving the names of the columns corresponding to the highest values
# The '1+sort_array' accounts for the fact that df has the 'communityname' column and sort_array sorted indices in an array that didn't have the column (we dropped it before doing PCA)

# Same thing but ... a little bit more compact
pca_x_2_highest_values = np.asarray(pca_vehi.components_[1])[np.argsort(np.abs(pca_vehi.components_[1]))][::-1][:10]
pca_x_2_cols_highest_values = filtered_vehi.columns[4+np.argsort(np.abs(pca_vehi.components_[1]))][::-1][:10]

# Printing the result
print('\nFirst Principal Component : 10 Highest Loadings (in magnitude)\n')
for i in range(7):
    print('The Column "{}" has a loading of : {}'.format(pca_x_1_cols_highest_values[i], pca_x_1_highest_values[i]))

print('\nSecond Principal Component : 10 Highest Loadings (in magnitude)\n')
for i in range(7):
    print('The Column "{}" has a loading of : {}'.format(pca_x_2_cols_highest_values[i], pca_x_2_highest_values[i]))
First Principal Component : 10 Highest Loadings (in magnitude)

The Column "Total number of cars which are privately accessible to citizens(Owned/leased)" has a loading of : 0.7308515422440944
The Column "Total Number of households which have access to a Vehicle " has a loading of : 0.5971430141477181
The Column "Number of households which have access to 1 vehicle" has a loading of : 0.2516932104151973
The Column "Number of households which have access to 2 vehicles" has a loading of : 0.1589297133774468
The Column "Total Number of households which do not have access to vehicles" has a loading of : 0.1388245670489415
The Column "Number of households which have accessibilty to 3 vehicles" has a loading of : 0.03521495133677829
The Column "Number of households which have access to 4 or more vehicles" has a loading of : 0.012480571969354176

Second Principal Component : 10 Highest Loadings (in magnitude)

The Column "Total Number of households which do not have access to vehicles" has a loading of : 0.6069706908142035
The Column "Total number of cars which are privately accessible to citizens(Owned/leased)" has a loading of : -0.5486278606850963
The Column "Total Number of households which have access to a Vehicle " has a loading of : 0.5053124700331045
The Column "Number of households which have access to 1 vehicle" has a loading of : 0.18744066264158202
The Column "Number of households which have access to 2 vehicles" has a loading of : -0.18459802045749993
The Column "Number of households which have accessibilty to 3 vehicles" has a loading of : -0.0693195886540595
The Column "Number of households which have access to 4 or more vehicles" has a loading of : -0.03518127431112231
In [683]:
# Change to a bit better style and larger figure.
sns.set_style('whitegrid')
fig, ax = plt.subplots(figsize=(35, 6))

# Plot our violins.
sns.violinplot(x='GEO_LABEL', y= 'Total Number of households which do not have access to vehicles', data=filtered_vehi)

# Rotate the x-axis labels and remove the plot border on the left.
_ = plt.xticks(rotation=45, ha='right')
sns.despine(left=True)
In [684]:
# Change to a bit better style and larger figure.
sns.set_style('whitegrid')
fig, ax = plt.subplots(figsize=(35, 6))

# Plot our violins.
sns.violinplot(x='GEO_LABEL', y= 'Number of households which have access to 1 vehicle', data=filtered_vehi)

# Rotate the x-axis labels and remove the plot border on the left.
_ = plt.xticks(rotation=45, ha='right')
sns.despine(left=True)
In [685]:
# Let's extract the numeric columns and turn them into a numpy 2d array (matrix)

vehi_num = filtered_vehi.select_dtypes(include='number') # selects numeric columns only
vehi_num.head()
Out[685]:
Total Number of households which have access to a Vehicle Total number of cars which are privately accessible to citizens(Owned/leased) Total Number of households which do not have access to vehicles Number of households which have access to 1 vehicle Number of households which have access to 2 vehicles Number of households which have accessibilty to 3 vehicles Number of households which have access to 4 or more vehicles
11 200727.0 314138.0 25261.0 75300.0 73419.0 18882.0 7865.0
12 251241.0 343690.0 43588.0 106212.0 76970.0 17830.0 6641.0
13 222042.0 273534.0 47578.0 99389.0 57798.0 12825.0 4452.0
16 180213.0 253649.0 28021.0 78377.0 54354.0 14024.0 5437.0
17 231905.0 292118.0 50674.0 100340.0 60173.0 14750.0 5968.0
In [686]:
# Let's compute the distance matrix of pairwise euclidean distances
dist_mat = cdist(vehi_num, vehi_num) # computes the distances between rows of first dataframe and rows of second dataframe. Here I gave the same dataFrame twice, to get a pairwise distance matrix
print("The shape of this matrix is", dist_mat.shape)
The shape of this matrix is (112, 112)
In [687]:
mds_vehi = MDS(n_components=2, random_state=123, dissimilarity='precomputed') # dissimilarity precomputed means that we provide the distance matrix, otherwise MDS() can compute it itself
mds_x_vehi = mds_vehi.fit_transform(dist_mat)
In [965]:
# Let's visualize the results
plt.figure(num=5, figsize=(25, 12))
plt.scatter(mds_x_vehi[:, 0], mds_x_vehi[:, 1], alpha = .3, s=100)

# We need to loop over every borough, get its name from the borough dataframe and its location from mds_x and add the text to our scatterplot
for index,name in enumerate(filtered_vehi['GEO_LABEL']):
    plt.annotate(s=name, xy=(mds_x_vehi[index, 0], mds_x_vehi[index, 1]))

plt.xlabel('Axis 1')
plt.ylabel('Axis 2')
plt.title('Multidimensional Scaling on Districts and subdivisions (All rows)');
In [ ]:
 
In [ ]:
 
In [ ]:
 

Fitting first to fourth order polynomials USE this when merging all PCA calculations

plt.figure(num=2, figsize = (16, 10)) plt.subplot(2,2,1) ax1 = sns.regplot(filtered_vehi['White %'], filtered_vehi['inEmployment'], order=1, scatter_kws={'alpha':0.3}, line_kws={'color':'tomato'}) ax1.set(title = 'Regression : Order 1') plt.subplot(2,2,2) ax2 = sns.regplot(filtered_vehi['Black %'], filtered_vehi['inEmployment'], order=1, scatter_kws={'alpha':0.3}, line_kws={'color':'tomato'}) ax2.set(title = 'Regression : Order 2') plt.subplot(2,2,3) ax3 = sns.regplot(filtered_vehi['Asian %'], filtered_vehi['inEmployment'], order=1, scatter_kws={'alpha':0.3}, line_kws={'color':'tomato'}) ax3.set(title = 'Regression : Order 3') plt.subplot(2,2,4) ax4 = sns.regplot(filtered_vehi['Mixed %'], filtered_vehi['inEmployment'], order=1, scatter_kws={'alpha':0.3}, line_kws={'color':'tomato'}) ax4.set(title = 'Regression : Order 4') plt.tight_layout();

In [ ]:
 
In [ ]:
 

Data on religion in England

In [689]:
df_reli=pd.read_csv("data on religion of England population .csv")
In [690]:
df_reli = df_reli.iloc[1: , :]
In [691]:
df_reli
Out[691]:
CDU_ID GEO_CODE GEO_LABEL GEO_TYPE GEO_TYP2 Religion [E][S][W] : Total\ Religion - Unit : Persons Religion [E][S][W] : Christian - Unit : Persons Religion [E][S][W] : Buddhist - Unit : Persons Religion [E][S][W] : Hindu - Unit : Persons Religion [E][S][W] : Jewish - Unit : Persons Religion [E][S][W] : Muslim - Unit : Persons Religion [E][S][W] : Sikh - Unit : Persons Religion [E][S][W] : Other religion - Unit : Persons Religion [E][S][W] : No religion - Unit : Persons Religion [E][S][W] : Religion not stated - Unit : Persons
1 4.0 E92000001 England Countries and Groupings CTRY 53012456.0 31479876.0 238626.0 806199.0 261282.0 2660116.0 420196.0 227825.0 13114232.0 3804104.0
2 8.0 E12000001 North East Regions RGN 2596886.0 1753334.0 6316.0 7772.0 4503.0 46764.0 5964.0 6668.0 607700.0 157865.0
3 9.0 E12000002 North West Regions RGN 7052177.0 4742860.0 20695.0 38259.0 30417.0 356458.0 8857.0 19166.0 1397916.0 437549.0
4 10.0 E12000003 Yorkshire and The Humber Regions RGN 5283733.0 3143819.0 14319.0 24074.0 9929.0 326050.0 22179.0 16517.0 1366219.0 360627.0
5 11.0 E12000004 East Midlands Regions RGN 4533222.0 2666172.0 12672.0 89723.0 4254.0 140649.0 44335.0 17918.0 1248056.0 309443.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
219050 231883.0 E00176770 E00176770 Output Areas and Small Areas OASA 952.0 603.0 7.0 2.0 1.0 26.0 0.0 13.0 239.0 61.0
219051 231884.0 E00176771 E00176771 Output Areas and Small Areas OASA 621.0 411.0 1.0 0.0 10.0 2.0 2.0 3.0 172.0 20.0
219052 231885.0 E00176772 E00176772 Output Areas and Small Areas OASA 256.0 203.0 1.0 9.0 2.0 1.0 0.0 0.0 27.0 13.0
219053 231886.0 E00176773 E00176773 Output Areas and Small Areas OASA 169.0 122.0 0.0 0.0 0.0 1.0 0.0 0.0 32.0 14.0
219054 231887.0 E00176774 E00176774 Output Areas and Small Areas OASA 102.0 88.0 0.0 0.0 1.0 0.0 0.0 1.0 7.0 5.0

219054 rows × 15 columns

In [692]:
GeoCodes=["E09000002","E06000022","E06000055","E06000037","E09000004","E06000008","E06000028","E09000005","E06000043","E06000023","E09000006","E10000002","E10000003","E09000007","E06000049","E41000052","E09000008","E10000006","E06000005","E07000035","E06000015","E07000042","E10000009","E06000047","E09000009","E06000011","E10000011","E09000010","E10000012","E10000013","E09000011","E09000012","E06000006","E09000013","E10000014","E09000014","E09000015","E06000001","E09000016","E06000019","E10000015","E09000017","E09000018","E06000046","E09000019","E09000020","E10000016","E06000010","E09000021","E09000022","E10000017","E06000016","E10000018","E09000023","E10000019","E02000001","E06000032","E08000003","E06000035","E11000002","E09000024","E06000002","E06000042","E09000025","E10000020","E06000012","E06000013","E06000024","E10000023","E10000021","E06000048","E06000018","E10000024","E10000025","E06000031","E06000026","E06000029","E06000044","E09000026","E06000003","E09000027","E06000017","E06000051","E10000027","E06000025","E11000003","E06000045","E06000033","E09000028","E10000028","E06000021","E06000004","E10000029","E10000030","E09000029","E06000030","E06000020","E06000034","E06000027","E09000030","E11000004","E09000031","E09000032","E06000007","E10000031","E11000005","E10000032","E11000006","E41000324","E06000054","E10000034","E06000014"]
In [693]:
filtered_reli = df_reli[df_reli['GEO_CODE'].isin(GeoCodes)]
filtered_reli.head()
Out[693]:
CDU_ID GEO_CODE GEO_LABEL GEO_TYPE GEO_TYP2 Religion [E][S][W] : Total\ Religion - Unit : Persons Religion [E][S][W] : Christian - Unit : Persons Religion [E][S][W] : Buddhist - Unit : Persons Religion [E][S][W] : Hindu - Unit : Persons Religion [E][S][W] : Jewish - Unit : Persons Religion [E][S][W] : Muslim - Unit : Persons Religion [E][S][W] : Sikh - Unit : Persons Religion [E][S][W] : Other religion - Unit : Persons Religion [E][S][W] : No religion - Unit : Persons Religion [E][S][W] : Religion not stated - Unit : Persons
11 17.0 E10000002 Buckinghamshire Counties CNTY 505283.0 305804.0 2207.0 6244.0 1511.0 25781.0 4657.0 1803.0 121190.0 36086.0
12 18.0 E10000003 Cambridgeshire Counties CNTY 621210.0 361532.0 3264.0 4142.0 1652.0 8990.0 895.0 2636.0 189016.0 49083.0
13 19.0 E10000006 Cumbria Counties CNTY 499858.0 359235.0 1353.0 559.0 203.0 1336.0 64.0 1364.0 101496.0 34248.0
16 22.0 E10000009 Dorset Counties CNTY 412905.0 269737.0 1280.0 550.0 519.0 1318.0 88.0 2230.0 104221.0 32962.0
17 23.0 E10000011 East Sussex Counties CNTY 526671.0 315659.0 2190.0 1501.0 1074.0 4201.0 178.0 3508.0 155723.0 42637.0
In [966]:
filtered_reli.loc[filtered_reli.GEO_CODE == 'E06000037', 'GEO_LABEL'] = "Berkshire"
filtered_reli.loc[filtered_reli.GEO_CODE == 'E06000023', 'GEO_LABEL'] = "Bristol"
filtered_reli.loc[filtered_reli.GEO_CODE == 'E06000049', 'GEO_LABEL'] = "Cheshire"
filtered_reli.loc[filtered_reli.GEO_CODE == 'E41000052', 'GEO_LABEL'] = "Cornwall"
filtered_reli.loc[filtered_reli.GEO_CODE == 'E07000035', 'GEO_LABEL'] = "Derbyshire"
filtered_reli.loc[filtered_reli.GEO_CODE == 'E07000042', 'GEO_LABEL'] = "Devon"
filtered_reli.loc[filtered_reli.GEO_CODE == 'E06000047', 'GEO_LABEL'] = "Durham"
filtered_reli.loc[filtered_reli.GEO_CODE == 'E06000019', 'GEO_LABEL'] = "Herefordshire"
filtered_reli.loc[filtered_reli.GEO_CODE == 'E06000010', 'GEO_LABEL'] = "Kingston upon Hull"
filtered_reli.loc[filtered_reli.GEO_CODE == 'E02000001', 'GEO_LABEL'] = "City of London"
filtered_reli.loc[filtered_reli.GEO_CODE == 'E41000324', 'GEO_LABEL'] = "Westminster"
filtered_reli.loc[filtered_reli.GEO_CODE == 'E06000021', 'GEO_LABEL'] = "Stockton-on-Trent"
In [1009]:
filtered_reli = filtered_reli.drop(['GEO_TYP2'], axis=1)
In [1010]:
filtered_reli = filtered_reli.drop(['GEO_TYPE'], axis=1)
In [1011]:
filtered_reli.head()
Out[1011]:
GEO_CODE GEO_LABEL Religion [E][S][W] : Total\ Religion - Unit : Persons Religion [E][S][W] : Christian - Unit : Persons Religion [E][S][W] : Buddhist - Unit : Persons Religion [E][S][W] : Hindu - Unit : Persons Religion [E][S][W] : Jewish - Unit : Persons Religion [E][S][W] : Muslim - Unit : Persons Religion [E][S][W] : Sikh - Unit : Persons Religion [E][S][W] : Other religion - Unit : Persons Religion [E][S][W] : No religion - Unit : Persons Religion [E][S][W] : Religion not stated - Unit : Persons
11 E10000002 Buckinghamshire 505283.0 305804.0 2207.0 6244.0 1511.0 25781.0 4657.0 1803.0 121190.0 36086.0
12 E10000003 Cambridgeshire 621210.0 361532.0 3264.0 4142.0 1652.0 8990.0 895.0 2636.0 189016.0 49083.0
13 E10000006 Cumbria 499858.0 359235.0 1353.0 559.0 203.0 1336.0 64.0 1364.0 101496.0 34248.0
16 E10000009 Dorset 412905.0 269737.0 1280.0 550.0 519.0 1318.0 88.0 2230.0 104221.0 32962.0
17 E10000011 East Sussex 526671.0 315659.0 2190.0 1501.0 1074.0 4201.0 178.0 3508.0 155723.0 42637.0
In [ ]:
 
In [694]:
# Transform the data into a numpy ndarray
np_array_reli = filtered_reli.values
In [695]:
# Removing the first four columns
np_array_reli = np_array_reli[:, 5:]
In [696]:
np_array_reli
Out[696]:
array([[505283.0, 305804.0, 2207.0, ..., 1803.0, 121190.0, 36086.0],
       [621210.0, 361532.0, 3264.0, ..., 2636.0, 189016.0, 49083.0],
       [499858.0, 359235.0, 1353.0, ..., 1364.0, 101496.0, 34248.0],
       ...,
       [534476.0, 319575.0, 1735.0, ..., 3621.0, 162062.0, 45570.0],
       [226771.0, 101221.0, 3286.0, ..., 1308.0, 47064.0, 21170.0],
       [7375.0, 3344.0, 92.0, ..., 28.0, 2522.0, 651.0]], dtype=object)
In [697]:
# Building the PCA model
pca_reli = PCA(n_components=2).fit(np_array_reli) # We're telling PCA here to keep only the 2 first principal components when fitting the data
pca_x_r = pca_reli.transform(np_array_reli) # This variable will contain the coordinates of each row in the new system of coordinates found by PCA (restricted to the first two directions or principal components)
# Those 2 Principal Components are going to be used to project the data onto a 2D space in order to be able 
# to visualize a "compressed" version (in terms of variance) of our initial multivariate DataFrame

# Let's check out the result of our PCA applied to the dataFrame df
print("The pca_x_r variable has a shape of :", pca_x_r.shape)
print(pca_x_r)
The pca_x_r variable has a shape of : (112, 2)
[[ 9.73183749e+04 -3.74269731e+03]
 [ 2.35717394e+05 -1.48016095e+04]
 [ 1.14502048e+05 -5.51210427e+04]
 [-3.00697187e+03 -3.02388561e+04]
 [ 1.26245994e+05 -2.06354011e+04]
 [ 1.17749058e+06 -6.53282887e+04]
 [ 2.18094720e+05 -3.45272010e+04]
 [ 1.08902152e+06 -6.96222905e+04]
 [ 8.19703084e+05 -1.07620078e+04]
 [ 1.26402991e+06 -7.25635821e+04]
 [ 9.30870412e+05 -6.86025353e+04]
 [ 2.73030465e+05 -1.98530862e+04]
 [ 3.72509341e+05 -6.43694803e+04]
 [ 5.28966794e+05 -4.00001098e+04]
 [ 3.24262929e+05 -2.09870616e+04]
 [ 2.31959982e+05 -5.70318297e+04]
 [ 4.42667566e+05 -3.61704610e+04]
 [ 2.78344588e+05 -1.84541589e+04]
 [ 1.37852071e+05 -3.49599279e+04]
 [ 5.36774049e+05 -6.87607806e+04]
 [ 3.72118014e+05 -3.27304097e+04]
 [ 8.60974339e+05 -4.71370440e+04]
 [ 1.55380280e+05 -3.18363560e+04]
 [ 4.66623691e+05 -3.37346005e+04]
 [ 1.88156486e+05 -4.40766922e+04]
 [ 1.21791377e+06 -1.55428322e+05]
 [ 1.10968062e+06 -2.48599857e+04]
 [ 8.39434035e+05 -6.02474764e+04]
 [ 2.66305370e+06  2.51121483e+05]
 [ 2.10074251e+06  1.56676365e+05]
 [-3.93329434e+05 -9.57669603e+03]
 [-3.40186124e+05 -2.26592987e+03]
 [-3.39645298e+05 -1.43350745e+04]
 [-2.71546743e+05 -1.50861133e+04]
 [-3.77547208e+05 -9.15909180e+03]
 [-3.49345124e+05 -1.68766713e+04]
 [-2.56159315e+05 -2.10830291e+04]
 [-3.38210047e+05  2.79414734e+04]
 [-2.02659082e+05 -1.28800125e+03]
 [-9.58840071e+04 -2.96240380e+04]
 [-3.14493841e+05 -7.66855095e+03]
 [-3.02544173e+05 -1.16038204e+04]
 [-2.69252259e+05 -7.47962303e+03]
 [-2.17320320e+05  1.40987903e+04]
 [-1.57924365e+05  9.26592770e+04]
 [-4.61237769e+05 -4.12789300e+03]
 [-1.59364642e+05  3.36282817e+04]
 [-2.81781512e+05 -1.65293252e+04]
 [-3.06043846e+05 -7.28520810e+03]
 [-2.08273053e+05 -2.06125224e+03]
 [-2.97451068e+05 -4.28714153e+03]
 [-1.30702493e+04  2.84310415e+04]
 [-2.62432105e+05 -1.04256455e+04]
 [-1.91196484e+05 -1.01284391e+04]
 [-1.99725337e+05 -8.24639528e+03]
 [-3.48077807e+05 -8.72394425e+03]
 [-2.88755154e+05 -3.36648463e+03]
 [-3.29276845e+05 -6.98978709e+03]
 [-2.57680799e+05 -4.16326007e+03]
 [-2.90484570e+05  7.50045086e+03]
 [-2.78765655e+05  4.28573275e+04]
 [-3.01304277e+05 -1.71268470e+03]
 [-3.15994178e+05 -8.20660409e+03]
 [-1.92387272e+05 -4.61997902e+03]
 [-3.20301616e+05 -9.75870573e+03]
 [-2.15747137e+05  8.08762483e+03]
 [-1.94710694e+05  1.79261393e+04]
 [-2.66241821e+05  4.26423933e+03]
 [-2.30253971e+05  7.95282204e+03]
 [-3.40550716e+05 -6.96023149e+03]
 [ 1.31966719e+05 -5.69703116e+04]
 [-1.17240604e+05 -2.94120483e+04]
 [-5.06712401e+04 -3.37364142e+04]
 [-1.29803901e+05 -2.84544427e+04]
 [ 6.60488587e+04 -3.06289691e+04]
 [-3.19923830e+05  1.92832029e+02]
 [-4.19516407e+05 -7.38304599e+03]
 [-4.13075540e+05 -5.11688706e+03]
 [ 6.86822177e+04  6.81466891e+04]
 [-2.90200639e+05  1.52079067e+04]
 [-2.28031754e+05 -8.28656973e+03]
 [-1.70736596e+05  7.23455224e+04]
 [-1.35946649e+05 -8.77820763e+03]
 [-2.70213901e+05  4.47121087e+04]
 [-8.21514194e+04  1.68446207e+04]
 [-1.34854796e+05  6.54725192e+04]
 [-1.47679557e+05  3.78579866e+04]
 [-2.11384645e+05  1.33339871e+04]
 [-2.36374104e+05  4.52378424e+04]
 [-2.94291531e+05  1.11758717e+04]
 [-2.20278469e+05  3.76491762e+04]
 [-2.54975051e+05  5.45270936e+04]
 [-2.18414542e+05 -1.43450254e+04]
 [-1.98020699e+05  3.30594488e+04]
 [-2.29833375e+05  4.96095609e+04]
 [-2.77846970e+05  2.91279666e+04]
 [-3.22940306e+05  9.89204030e+03]
 [-3.21187715e+05  7.17586515e+03]
 [-1.53051022e+05  1.41547193e+04]
 [-1.85746461e+05  1.28350285e+04]
 [-2.73790992e+05  9.18848090e+03]
 [-1.75424466e+05  9.90634504e+04]
 [-2.11297133e+05  8.12646830e+04]
 [-2.86875810e+05  1.38519979e+03]
 [-1.71698585e+05  1.75654958e+04]
 [-2.81605244e+05 -3.31031504e+02]
 [-2.43830988e+05  1.01742244e+05]
 [-2.15016532e+05  4.75825506e+04]
 [-1.49303044e+05  1.73016455e+04]
 [ 1.35996338e+05 -2.31533368e+04]
 [-2.53895819e+05  4.02690944e+04]
 [-4.98840171e+05 -2.47740547e+02]]
In [698]:
plt.subplot(1,2,1)
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.title("Projection of the original data onto the first 2 PCs")
plt.scatter(pca_x_r[:,0], pca_x_r[:,1],cmap='viridis')
Out[698]:
<matplotlib.collections.PathCollection at 0x7f97b8cb7e80>
In [699]:
# Ordering the features by their coefficients in the equation of the first Principal Component
# PC1 = a*column1 + b*column2 + ... + zzz*column101. The coefficients of PC1 a, b, c, ..zzz also called loadings, are stored in pca_clean.components_[0]
pca_x_1 = pca_reli.components_[0] # First principal component
sort_array_1 = np.argsort(np.abs(pca_x_1)) # Sorting the 'Indexes' of the pca_clean_1 array from the lowest value to the highest (in magnitude, ignoring the sign)
pca_x_1_highest_values = pca_x_1[sort_array_1][::-1][:10] # We sorted from low to high, [::-1] retrieves all values backward (to inverse the sort), [:10] get the first
pca_x_1_cols_highest_values = filtered_reli.columns[5+sort_array_1][::-1][:10] # retrieving the names of the columns corresponding to the highest values
# The '1+sort_array' accounts for the fact that df has the 'communityname' column and sort_array sorted indices in an array that didn't have the column (we dropped it before doing PCA)

# Same thing but ... a little bit more compact
pca_x_2_highest_values = np.asarray(pca_reli.components_[1])[np.argsort(np.abs(pca_reli.components_[1]))][::-1][:10]
pca_x_2_cols_highest_values = filtered_reli.columns[5+np.argsort(np.abs(pca_reli.components_[1]))][::-1][:10]

# Printing the result
print('\nFirst Principal Component : 10 Highest Loadings (in magnitude)\n')
for i in range(7):
    print('The Column "{}" has a loading of : {}'.format(pca_x_1_cols_highest_values[i], pca_x_1_highest_values[i]))

print('\nSecond Principal Component : 10 Highest Loadings (in magnitude)\n')
for i in range(7):
    print('The Column "{}" has a loading of : {}'.format(pca_x_2_cols_highest_values[i], pca_x_2_highest_values[i]))
First Principal Component : 10 Highest Loadings (in magnitude)

The Column "Religion [E][S][W] : Total\ Religion - Unit : Persons" has a loading of : 0.8358223528984076
The Column "Religion [E][S][W] : Christian - Unit : Persons" has a loading of : 0.5062141112136527
The Column "Religion [E][S][W] : No religion - Unit : Persons" has a loading of : 0.19864300475868618
The Column "Religion [E][S][W] : Religion not stated - Unit : Persons" has a loading of : 0.05474763482440188
The Column "Religion [E][S][W] : Muslim - Unit : Persons" has a loading of : 0.04972111332311746
The Column "Religion [E][S][W] : Sikh - Unit : Persons" has a loading of : 0.012401914922459565
The Column "Religion [E][S][W] : Hindu - Unit : Persons" has a loading of : 0.006843167980654495

Second Principal Component : 10 Highest Loadings (in magnitude)

The Column "Religion [E][S][W] : Muslim - Unit : Persons" has a loading of : 0.677264993860754
The Column "Religion [E][S][W] : Christian - Unit : Persons" has a loading of : -0.6134195391977886
The Column "Religion [E][S][W] : Total\ Religion - Unit : Persons" has a loading of : 0.3427078474029192
The Column "Religion [E][S][W] : Sikh - Unit : Persons" has a loading of : 0.14791215089594184
The Column "Religion [E][S][W] : Hindu - Unit : Persons" has a loading of : 0.14073736372564977
The Column "Religion [E][S][W] : No religion - Unit : Persons" has a loading of : -0.06958231722652441
The Column "Religion [E][S][W] : Religion not stated - Unit : Persons" has a loading of : 0.024656872502235576
In [700]:
# Change to a bit better style and larger figure.
sns.set_style('whitegrid')
fig, ax = plt.subplots(figsize=(35, 6))

# Plot our violins.
sns.violinplot(x='GEO_LABEL', y= 'Religion [E][S][W] : Christian - Unit : Persons', data=filtered_reli)

# Rotate the x-axis labels and remove the plot border on the left.
_ = plt.xticks(rotation=45, ha='right')
sns.despine(left=True)
In [701]:
# Change to a bit better style and larger figure.
sns.set_style('whitegrid')
fig, ax = plt.subplots(figsize=(35, 6))

# Plot our violins.
sns.violinplot(x='GEO_LABEL', y= 'Religion [E][S][W] : Muslim - Unit : Persons', data=filtered_reli)

# Rotate the x-axis labels and remove the plot border on the left.
_ = plt.xticks(rotation=45, ha='right')
sns.despine(left=True)
In [702]:
# Change to a bit better style and larger figure.
sns.set_style('whitegrid')
fig, ax = plt.subplots(figsize=(35, 6))

# Plot our violins.
sns.violinplot(x='GEO_LABEL', y= 'Religion [E][S][W] : Hindu - Unit : Persons', data=filtered_reli)

# Rotate the x-axis labels and remove the plot border on the left.
_ = plt.xticks(rotation=45, ha='right')
sns.despine(left=True)
In [703]:
# Change to a bit better style and larger figure.
sns.set_style('whitegrid')
fig, ax = plt.subplots(figsize=(35, 6))

# Plot our violins.
sns.violinplot(x='GEO_LABEL', y= 'Religion [E][S][W] : Sikh - Unit : Persons', data=filtered_reli)

# Rotate the x-axis labels and remove the plot border on the left.
_ = plt.xticks(rotation=45, ha='right')
sns.despine(left=True)
In [704]:
# Change to a bit better style and larger figure.
sns.set_style('whitegrid')
fig, ax = plt.subplots(figsize=(35, 6))

# Plot our violins.
sns.violinplot(x='GEO_LABEL', y= 'Religion [E][S][W] : Total\ Religion - Unit : Persons', data=filtered_reli)

# Rotate the x-axis labels and remove the plot border on the left.
_ = plt.xticks(rotation=45, ha='right')
sns.despine(left=True)
In [705]:
# Change to a bit better style and larger figure.
sns.set_style('whitegrid')
fig, ax = plt.subplots(figsize=(35, 6))

# Plot our violins.
sns.violinplot(x='GEO_LABEL', y= 'Religion [E][S][W] : No religion - Unit : Persons', data=filtered_reli)

# Rotate the x-axis labels and remove the plot border on the left.
_ = plt.xticks(rotation=45, ha='right')
sns.despine(left=True)
In [ ]:
 
In [706]:
filtered_reli = filtered_reli.drop('CDU_ID', 1)
#filtered_vehi = filtered_vehi.drop('CDU_ID', 1)
In [707]:
# Let's extract the numeric columns and turn them into a numpy 2d array (matrix)

reli_num = filtered_reli.select_dtypes(include='number') # selects numeric columns only
reli_num.head()
Out[707]:
Religion [E][S][W] : Total\ Religion - Unit : Persons Religion [E][S][W] : Christian - Unit : Persons Religion [E][S][W] : Buddhist - Unit : Persons Religion [E][S][W] : Hindu - Unit : Persons Religion [E][S][W] : Jewish - Unit : Persons Religion [E][S][W] : Muslim - Unit : Persons Religion [E][S][W] : Sikh - Unit : Persons Religion [E][S][W] : Other religion - Unit : Persons Religion [E][S][W] : No religion - Unit : Persons Religion [E][S][W] : Religion not stated - Unit : Persons
11 505283.0 305804.0 2207.0 6244.0 1511.0 25781.0 4657.0 1803.0 121190.0 36086.0
12 621210.0 361532.0 3264.0 4142.0 1652.0 8990.0 895.0 2636.0 189016.0 49083.0
13 499858.0 359235.0 1353.0 559.0 203.0 1336.0 64.0 1364.0 101496.0 34248.0
16 412905.0 269737.0 1280.0 550.0 519.0 1318.0 88.0 2230.0 104221.0 32962.0
17 526671.0 315659.0 2190.0 1501.0 1074.0 4201.0 178.0 3508.0 155723.0 42637.0
In [708]:
# Let's compute the distance matrix of pairwise euclidean distances
dist_mat_reli = cdist(reli_num, reli_num) # computes the distances between rows of first dataframe and rows of second dataframe. Here I gave the same dataFrame twice, to get a pairwise distance matrix
print("The shape of this matrix is", dist_mat_reli.shape)
The shape of this matrix is (112, 112)
In [709]:
mds_reli = MDS(n_components=2, random_state=123, dissimilarity='precomputed') # dissimilarity precomputed means that we provide the distance matrix, otherwise MDS() can compute it itself
mds_x_reli = mds_reli.fit_transform(dist_mat_reli)
In [967]:
# Let's visualize the results
plt.figure(num=5, figsize=(25, 12))
plt.scatter(mds_x_reli[:, 0], mds_x_reli[:, 1], alpha = .3, s=100)

# We need to loop over every borough, get its name from the borough dataframe and its location from mds_x and add the text to our scatterplot
for index,name in enumerate(filtered_reli['GEO_LABEL']):
    plt.annotate(s=name, xy=(mds_x_reli[index, 0], mds_x_reli[index, 1]))

plt.xlabel('Axis 1')
plt.ylabel('Axis 2')
plt.title('Multidimensional Scaling on Districts and subdivisions (All rows)');
In [ ]:
 

Data on population health England

In [711]:
df_health=pd.read_csv("Data on population health in England .csv")
df_health = df_health.iloc[1: , :]
In [712]:
df_health
Out[712]:
CDU_ID GEO_CODE GEO_LABEL GEO_TYPE GEO_TYP2 General health : Very bad health - Unit : Persons General health : Total\ General health - Unit : Persons General health : Very good health - Unit : Persons General health : Good health - Unit : Persons General health : Fair health - Unit : Persons General health : Bad health - Unit : Persons
1 4.0 E92000001 England Countries and Groupings CTRY 660749.0 53012456.0 25005712.0 18141457.0 6954092.0 2250446.0
2 8.0 E12000001 North East Regions RGN 44064.0 2596886.0 1142170.0 866035.0 395243.0 149374.0
3 9.0 E12000002 North West Regions RGN 106504.0 7052177.0 3276592.0 2314301.0 982586.0 372194.0
4 10.0 E12000003 Yorkshire and The Humber Regions RGN 70694.0 5283733.0 2407907.0 1817231.0 739959.0 247942.0
5 11.0 E12000004 East Midlands Regions RGN 56258.0 4533222.0 2053334.0 1593206.0 634414.0 196010.0
... ... ... ... ... ... ... ... ... ... ... ...
219050 231883.0 E00176770 E00176770 Output Areas and Small Areas OASA 25.0 952.0 265.0 386.0 210.0 66.0
219051 231884.0 E00176771 E00176771 Output Areas and Small Areas OASA 0.0 621.0 358.0 224.0 36.0 3.0
219052 231885.0 E00176772 E00176772 Output Areas and Small Areas OASA 4.0 256.0 144.0 72.0 27.0 9.0
219053 231886.0 E00176773 E00176773 Output Areas and Small Areas OASA 1.0 169.0 93.0 55.0 15.0 5.0
219054 231887.0 E00176774 E00176774 Output Areas and Small Areas OASA 4.0 102.0 35.0 26.0 26.0 11.0

219054 rows × 11 columns

In [713]:
GeoCodes=["E09000002","E06000022","E06000055","E06000037","E09000004","E06000008","E06000028","E09000005","E06000043","E06000023","E09000006","E10000002","E10000003","E09000007","E06000049","E41000052","E09000008","E10000006","E06000005","E07000035","E06000015","E07000042","E10000009","E06000047","E09000009","E06000011","E10000011","E09000010","E10000012","E10000013","E09000011","E09000012","E06000006","E09000013","E10000014","E09000014","E09000015","E06000001","E09000016","E06000019","E10000015","E09000017","E09000018","E06000046","E09000019","E09000020","E10000016","E06000010","E09000021","E09000022","E10000017","E06000016","E10000018","E09000023","E10000019","E02000001","E06000032","E08000003","E06000035","E11000002","E09000024","E06000002","E06000042","E09000025","E10000020","E06000012","E06000013","E06000024","E10000023","E10000021","E06000048","E06000018","E10000024","E10000025","E06000031","E06000026","E06000029","E06000044","E09000026","E06000003","E09000027","E06000017","E06000051","E10000027","E06000025","E11000003","E06000045","E06000033","E09000028","E10000028","E06000021","E06000004","E10000029","E10000030","E09000029","E06000030","E06000020","E06000034","E06000027","E09000030","E11000004","E09000031","E09000032","E06000007","E10000031","E11000005","E10000032","E11000006","E41000324","E06000054","E10000034","E06000014"]
In [714]:
filtered_health = df_health[df_health['GEO_CODE'].isin(GeoCodes)]
filtered_health.head()
Out[714]:
CDU_ID GEO_CODE GEO_LABEL GEO_TYPE GEO_TYP2 General health : Very bad health - Unit : Persons General health : Total\ General health - Unit : Persons General health : Very good health - Unit : Persons General health : Good health - Unit : Persons General health : Fair health - Unit : Persons General health : Bad health - Unit : Persons
11 17.0 E10000002 Buckinghamshire Counties CNTY 3902.0 505283.0 262901.0 170886.0 53987.0 13607.0
12 18.0 E10000003 Cambridgeshire Counties CNTY 5453.0 621210.0 306910.0 215746.0 73386.0 19715.0
13 19.0 E10000006 Cumbria Counties CNTY 6481.0 499858.0 225018.0 172789.0 71966.0 23604.0
16 22.0 E10000009 Dorset Counties CNTY 4467.0 412905.0 184353.0 148166.0 59671.0 16248.0
17 23.0 E10000011 East Sussex Counties CNTY 6886.0 526671.0 230697.0 187695.0 77795.0 23598.0
In [715]:
filtered_health = filtered_health.drop(['General health : Total\ General health - Unit : Persons'], axis=1)
In [716]:
filtered_health
Out[716]:
CDU_ID GEO_CODE GEO_LABEL GEO_TYPE GEO_TYP2 General health : Very bad health - Unit : Persons General health : Very good health - Unit : Persons General health : Good health - Unit : Persons General health : Fair health - Unit : Persons General health : Bad health - Unit : Persons
11 17.0 E10000002 Buckinghamshire Counties CNTY 3902.0 262901.0 170886.0 53987.0 13607.0
12 18.0 E10000003 Cambridgeshire Counties CNTY 5453.0 306910.0 215746.0 73386.0 19715.0
13 19.0 E10000006 Cumbria Counties CNTY 6481.0 225018.0 172789.0 71966.0 23604.0
16 22.0 E10000009 Dorset Counties CNTY 4467.0 184353.0 148166.0 59671.0 16248.0
17 23.0 E10000011 East Sussex Counties CNTY 6886.0 230697.0 187695.0 77795.0 23598.0
... ... ... ... ... ... ... ... ... ... ...
366 398.0 E09000031 Waltham Forest Local Authorities LA 3457.0 121652.0 91954.0 31133.0 10053.0
367 399.0 E09000032 Wandsworth Local Authorities LA 2814.0 176198.0 91935.0 27299.0 8749.0
368 400.0 E41000052 Cornwall, Isles of Scilly Local Authorities LA 7634.0 241894.0 179559.0 79429.0 25960.0
369 401.0 E41000324 City of London, Westminster Local Authorities LA 3547.0 122920.0 68181.0 22670.0 9453.0
8048 9937.0 E02000001 City of London 001 Middle Super Output Areas and Intermediate Zones MSOAIZ 56.0 4112.0 2374.0 643.0 190.0

112 rows × 10 columns

In [968]:
filtered_health.loc[filtered_health.GEO_CODE == 'E06000037', 'GEO_LABEL'] = "Berkshire"
filtered_health.loc[filtered_health.GEO_CODE == 'E06000023', 'GEO_LABEL'] = "Bristol"
filtered_health.loc[filtered_health.GEO_CODE == 'E06000049', 'GEO_LABEL'] = "Cheshire"
filtered_health.loc[filtered_health.GEO_CODE == 'E41000052', 'GEO_LABEL'] = "Cornwall"
filtered_health.loc[filtered_health.GEO_CODE == 'E07000035', 'GEO_LABEL'] = "Derbyshire"
filtered_health.loc[filtered_health.GEO_CODE == 'E07000042', 'GEO_LABEL'] = "Devon"
filtered_health.loc[filtered_health.GEO_CODE == 'E06000047', 'GEO_LABEL'] = "Durham"
filtered_health.loc[filtered_health.GEO_CODE == 'E06000019', 'GEO_LABEL'] = "Herefordshire"
filtered_health.loc[filtered_health.GEO_CODE == 'E06000010', 'GEO_LABEL'] = "Kingston upon Hull"
filtered_health.loc[filtered_health.GEO_CODE == 'E02000001', 'GEO_LABEL'] = "City of London"
filtered_health.loc[filtered_health.GEO_CODE == 'E41000324', 'GEO_LABEL'] = "Westminster"
filtered_health.loc[filtered_health.GEO_CODE == 'E06000021', 'GEO_LABEL'] = "Stockton-on-Trent"
In [1012]:
filtered_health = filtered_health.drop(['GEO_TYPE'], axis=1)
In [1013]:
filtered_health = filtered_health.drop(['GEO_TYP2'], axis=1)
In [1014]:
filtered_health.head()
Out[1014]:
GEO_CODE GEO_LABEL General health : Very bad health - Unit : Persons General health : Very good health - Unit : Persons General health : Good health - Unit : Persons General health : Fair health - Unit : Persons General health : Bad health - Unit : Persons
11 E10000002 Buckinghamshire 3902.0 262901.0 170886.0 53987.0 13607.0
12 E10000003 Cambridgeshire 5453.0 306910.0 215746.0 73386.0 19715.0
13 E10000006 Cumbria 6481.0 225018.0 172789.0 71966.0 23604.0
16 E10000009 Dorset 4467.0 184353.0 148166.0 59671.0 16248.0
17 E10000011 East Sussex 6886.0 230697.0 187695.0 77795.0 23598.0
In [ ]:
 
In [717]:
# Transform the data into a numpy ndarray
np_array_health = filtered_health.values
In [718]:
np_array_health
Out[718]:
array([[17.0, 'E10000002', 'Buckinghamshire', ..., 170886.0, 53987.0,
        13607.0],
       [18.0, 'E10000003', 'Cambridgeshire', ..., 215746.0, 73386.0,
        19715.0],
       [19.0, 'E10000006', 'Cumbria', ..., 172789.0, 71966.0, 23604.0],
       ...,
       [400.0, 'E41000052', 'Cornwall, Isles of Scilly', ..., 179559.0,
        79429.0, 25960.0],
       [401.0, 'E41000324', 'City of London, Westminster', ..., 68181.0,
        22670.0, 9453.0],
       [9937.0, 'E02000001', 'City of London 001', ..., 2374.0, 643.0,
        190.0]], dtype=object)
In [719]:
# Removing the first four columns
np_array_health = np_array_health[:, 5:]
In [720]:
# Building the PCA model
pca_health = PCA(n_components=2).fit(np_array_health) # We're telling PCA here to keep only the 2 first principal components when fitting the data
pca_x_h = pca_health.transform(np_array_health) # This variable will contain the coordinates of each row in the new system of coordinates found by PCA (restricted to the first two directions or principal components)
# Those 2 Principal Components are going to be used to project the data onto a 2D space in order to be able 
# to visualize a "compressed" version (in terms of variance) of our initial multivariate DataFrame

# Let's check out the result of our PCA applied to the dataFrame df
print("The pca_x_h variable has a shape of :", pca_x_h.shape)
print(pca_x_h)
The pca_x_h variable has a shape of : (112, 2)
[[ 6.18348840e+04  2.59038915e+04]
 [ 1.27073788e+05  1.61612315e+04]
 [ 3.86226089e+04 -1.04905590e+04]
 [-1.07044688e+04 -1.21606217e+04]
 [ 5.30627702e+04 -1.77836154e+04]
 [ 5.84223567e+05  1.08189695e+04]
 [ 1.06807712e+05  3.66308435e+03]
 [ 5.54540339e+05  3.64948289e+04]
 [ 4.37589347e+05  5.14250473e+04]
 [ 6.21735258e+05  5.24693498e+03]
 [ 4.34837699e+05 -4.36311156e+03]
 [ 1.39693375e+05  3.71340081e+03]
 [ 1.60120080e+05 -2.80061425e+04]
 [ 2.44869696e+05 -3.57370458e+04]
 [ 1.63371830e+05  3.45736899e+02]
 [ 1.06474151e+05  4.51043303e+03]
 [ 2.05214442e+05 -1.51846149e+04]
 [ 1.55364102e+05  3.79183012e+04]
 [ 6.00907338e+04 -8.55472002e+03]
 [ 2.47161100e+05 -1.08834678e+04]
 [ 1.80623782e+05 -7.15499553e+03]
 [ 4.56562484e+05  7.01296408e+04]
 [ 7.39276814e+04  2.32056630e+03]
 [ 2.32098496e+05  2.01553302e+03]
 [ 8.34600055e+04 -2.84844867e+03]
 [ 5.43398157e+05  2.49686101e+03]
 [ 5.23610806e+05 -2.34788056e+04]
 [ 3.80489835e+05 -2.06954069e+04]
 [ 1.33897143e+06 -4.63538145e+04]
 [ 1.06333290e+06 -4.25012132e+03]
 [-2.02777691e+05 -8.15612085e+03]
 [-1.74709843e+05 -6.04390154e+03]
 [-1.78291672e+05 -1.05364534e+04]
 [-1.41887262e+05 -4.67715022e+03]
 [-1.93023203e+05 -7.34482572e+03]
 [-1.81747924e+05 -4.33361346e+03]
 [-1.32046989e+05  2.61761106e+03]
 [-1.69080430e+05 -6.15467244e+03]
 [-1.07190859e+05 -1.28005583e+04]
 [-5.76043724e+04 -8.80090460e+03]
 [-1.61896276e+05 -9.10172324e+03]
 [-1.57069356e+05 -9.80387128e+03]
 [-1.33328298e+05  1.70627399e+03]
 [-1.08563498e+05 -8.28039092e+03]
 [-6.02835132e+04 -8.70072298e+03]
 [-2.31920891e+05 -4.24661462e+03]
 [-7.46604416e+04 -4.45450930e+03]
 [-1.45953369e+05 -6.42005612e+03]
 [-1.56587312e+05 -5.65283608e+03]
 [-1.13640000e+05 -1.51803607e+04]
 [-1.46582270e+05  2.38800564e+03]
 [ 4.05055059e+03  7.79563887e+03]
 [-1.34230078e+05 -5.15381800e+03]
 [-9.37041585e+04  2.84792071e+03]
 [-1.04181498e+05 -5.16149756e+03]
 [-1.80864467e+05 -1.14606071e+04]
 [-1.45935288e+05 -5.03277265e+03]
 [-1.66743907e+05 -4.99415350e+03]
 [-1.27545884e+05 -7.60373953e+02]
 [-1.45914998e+05 -9.02756505e+03]
 [-1.32553728e+05 -4.36988894e+03]
 [-1.52543699e+05 -7.32362985e+03]
 [-1.59255724e+05 -2.11546342e+03]
 [-9.70073209e+04 -6.42496752e+03]
 [-1.58465034e+05  3.06305783e+03]
 [-1.00882952e+05  5.31550477e+03]
 [-8.87067211e+04  4.16097650e+03]
 [-1.31088087e+05 -1.94176067e+03]
 [-1.11924132e+05 -1.59528882e+03]
 [-1.76136583e+05 -1.33468062e+04]
 [ 3.51891631e+04 -2.35045973e+04]
 [-7.15931101e+04 -1.23906449e+04]
 [-3.04967556e+04  6.09667851e+03]
 [-7.18883430e+04 -4.30794486e+03]
 [ 3.44303207e+04  1.22697635e+04]
 [-1.59758950e+05 -4.36288220e+03]
 [-2.12265282e+05 -5.16142328e+03]
 [-2.08261331e+05 -5.43299356e+03]
 [ 4.47511987e+04  1.16160556e+04]
 [-1.44163895e+05 -2.88812955e+03]
 [-1.14480850e+05 -8.68373826e+02]
 [-6.64932459e+04  1.97969329e+03]
 [-6.42637266e+04  7.41507141e+03]
 [-1.18531203e+05  1.22877362e+04]
 [-3.37047617e+04  2.33563371e+03]
 [-4.81622143e+04  6.27625875e+03]
 [-6.76420612e+04 -2.42091087e+03]
 [-9.96023611e+04  5.94242627e+03]
 [-1.04252241e+05  1.14336070e+04]
 [-1.39581790e+05  1.48682130e+04]
 [-9.97041146e+04  4.96118843e+03]
 [-1.09533688e+05 -1.59304915e+03]
 [-1.13248227e+05 -5.68859816e+03]
 [-8.71339752e+04  2.45611109e+03]
 [-9.93727959e+04  2.67194011e+03]
 [-1.29503051e+05  7.44778959e+03]
 [-1.53816586e+05  1.43567206e+04]
 [-1.54313762e+05  5.39685469e+03]
 [-6.60202879e+04  1.69421818e+04]
 [-8.70548481e+04  3.81217988e+03]
 [-1.30102752e+05  6.88217236e+03]
 [-6.81680475e+04  3.49075410e+03]
 [-8.55594509e+04  7.95093962e+02]
 [-1.34475898e+05  1.64782475e+04]
 [-7.53057634e+04  1.76809654e+04]
 [-1.38262316e+05 -3.74689475e+02]
 [-9.98083937e+04  7.81174411e+03]
 [-9.91611350e+04 -2.15840402e+03]
 [-5.79114223e+04  3.14000097e+04]
 [ 5.75924885e+04 -9.45491400e+03]
 [-1.14025188e+05  1.49061543e+04]
 [-2.50318766e+05 -5.11976991e+03]]
In [721]:
plt.subplot(1,2,1)
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.title("Projection of the original data onto the first 2 PCs")
plt.scatter(pca_x_h[:,0], pca_x_h[:,1],cmap='viridis')
Out[721]:
<matplotlib.collections.PathCollection at 0x7f9782872730>
In [ ]:
 
In [722]:
# Ordering the features by their coefficients in the equation of the first Principal Component
# PC1 = a*column1 + b*column2 + ... + zzz*column101. The coefficients of PC1 a, b, c, ..zzz also called loadings, are stored in pca_clean.components_[0]
pca_x_1 = pca_health.components_[0] # First principal component
sort_array_1 = np.argsort(np.abs(pca_x_1)) # Sorting the 'Indexes' of the pca_clean_1 array from the lowest value to the highest (in magnitude, ignoring the sign)
pca_x_1_highest_values = pca_x_1[sort_array_1][::-1][:10] # We sorted from low to high, [::-1] retrieves all values backward (to inverse the sort), [:10] get the first
pca_x_1_cols_highest_values = filtered_health.columns[5+sort_array_1][::-1][:10] # retrieving the names of the columns corresponding to the highest values
# The '1+sort_array' accounts for the fact that df has the 'communityname' column and sort_array sorted indices in an array that didn't have the column (we dropped it before doing PCA)

# Same thing but ... a little bit more compact
pca_x_2_highest_values = np.asarray(pca_health.components_[1])[np.argsort(np.abs(pca_health.components_[1]))][::-1][:10]
pca_x_2_cols_highest_values = filtered_health.columns[5+np.argsort(np.abs(pca_health.components_[1]))][::-1][:10]

# Printing the result
print('\nFirst Principal Component : 10 Highest Loadings (in magnitude)\n')
for i in range(5):
    print('The Column "{}" has a loading of : {}'.format(pca_x_1_cols_highest_values[i], pca_x_1_highest_values[i]))

print('\nSecond Principal Component : 10 Highest Loadings (in magnitude)\n')
for i in range(5):
    print('The Column "{}" has a loading of : {}'.format(pca_x_2_cols_highest_values[i], pca_x_2_highest_values[i]))
First Principal Component : 10 Highest Loadings (in magnitude)

The Column "General health : Very good health - Unit : Persons" has a loading of : 0.7749997899228033
The Column "General health : Good health - Unit : Persons" has a loading of : 0.5815112763067014
The Column "General health : Fair health - Unit : Persons" has a loading of : 0.2336889869509991
The Column "General health : Bad health - Unit : Persons" has a loading of : 0.07804187082866208
The Column "General health : Very bad health - Unit : Persons" has a loading of : 0.022779045703034573

Second Principal Component : 10 Highest Loadings (in magnitude)

The Column "General health : Fair health - Unit : Persons" has a loading of : -0.6083906561948577
The Column "General health : Very good health - Unit : Persons" has a loading of : 0.5641469610358348
The Column "General health : Good health - Unit : Persons" has a loading of : -0.4639786748985861
The Column "General health : Bad health - Unit : Persons" has a loading of : -0.2971689335921551
The Column "General health : Very bad health - Unit : Persons" has a loading of : -0.08951776335385532
In [723]:
filtered_health = filtered_health.drop(['CDU_ID'], axis=1)
In [724]:
# Let's extract the numeric columns and turn them into a numpy 2d array (matrix)
health_num = filtered_health.select_dtypes(include='number') # selects numeric columns only
health_num.head()
Out[724]:
General health : Very bad health - Unit : Persons General health : Very good health - Unit : Persons General health : Good health - Unit : Persons General health : Fair health - Unit : Persons General health : Bad health - Unit : Persons
11 3902.0 262901.0 170886.0 53987.0 13607.0
12 5453.0 306910.0 215746.0 73386.0 19715.0
13 6481.0 225018.0 172789.0 71966.0 23604.0
16 4467.0 184353.0 148166.0 59671.0 16248.0
17 6886.0 230697.0 187695.0 77795.0 23598.0
In [725]:
# Let's compute the distance matrix of pairwise euclidean distances
dist_mat_health = cdist(health_num, health_num) # computes the distances between rows of first dataframe and rows of second dataframe. Here I gave the same dataFrame twice, to get a pairwise distance matrix
print("The shape of this matrix is", dist_mat_health.shape)
The shape of this matrix is (112, 112)
In [726]:
mds_health = MDS(n_components=2, random_state=123, dissimilarity='precomputed') # dissimilarity precomputed means that we provide the distance matrix, otherwise MDS() can compute it itself
mds_x_health = mds_health.fit_transform(dist_mat_health)
In [727]:
# Let's visualize the results
plt.figure(num=5, figsize=(25, 12))
plt.scatter(mds_x_health[:, 0], mds_x_health[:, 1], alpha = .3, s=100)

# We need to loop over every borough, get its name from the borough dataframe and its location from mds_x and add the text to our scatterplot
for index,name in enumerate(filtered_health['GEO_LABEL']):
    plt.annotate(s=name, xy=(mds_x_health[index, 0], mds_x_health[index, 1]))

plt.xlabel('Axis 1')
plt.ylabel('Axis 2')
plt.title('Multidimensional Scaling on Districts and subdivisions (All rows)');

Data on Deprivation on UK counties & Boroughs

The dimensions of deprivation used to classify households are indicators based on four selected household characteristics. A household is deprived in a dimension if they meet one or more of the following conditions:

  • Employment: Where any member of a household, who is not a full-time student, is either unemployed or long-term sick.

  • Education: No person in the household has at least Level 2 education (see highest level of qualification), and no person aged 16 to 18 is a full-time student.

  • Health and disability: Any person in the household has general health that is 'bad' or 'very bad' or has a long-term health problem.

  • Housing: The household's accommodation is either overcrowded, with an occupancy rating -1 or less, or is in a shared dwelling, or has no central heating. </ul>
    A household is classified as being deprived in none, or one to four of these dimensions in any combination.

In [728]:
df_deprivation=pd.read_csv("Deprevation in England Population.csv")
df_deprivation = df_deprivation.iloc[1: , :]
In [729]:
df_deprivation.head()
Out[729]:
CDU_ID GEO_CODE GEO_LABEL GEO_TYPE GEO_TYP2 Deprivation; classification of household [E][S][W] : Total\ Classification of household deprivation - Unit : Households Deprivation; classification of household [E][S][W] : Household is not deprived in any dimension - Unit : Households Deprivation; classification of household [E][S][W] : Household is deprived in 1 dimension - Unit : Households Deprivation; classification of household [E][S][W] : Household is deprived in 2 dimensions - Unit : Households Deprivation; classification of household [E][S][W] : Household is deprived in 3 dimensions - Unit : Households Deprivation; classification of household [E][S][W] : Household is deprived in 4 dimensions - Unit : Households
1 4.0 E92000001 England Countries and Groupings CTRY 22063368.0 9385648.0 7204181.0 4223982.0 1133622.0 115935.0
2 8.0 E12000001 North East Regions RGN 1129935.0 448560.0 356794.0 248597.0 71195.0 4789.0
3 9.0 E12000002 North West Regions RGN 3009549.0 1218768.0 953693.0 626702.0 192951.0 17435.0
4 10.0 E12000003 Yorkshire and The Humber Regions RGN 2224059.0 909078.0 720188.0 457389.0 125936.0 11468.0
5 11.0 E12000004 East Midlands Regions RGN 1895604.0 811134.0 614226.0 372252.0 90406.0 7586.0
In [731]:
GeoCodes=["E09000002","E06000022","E06000055","E06000037","E09000004","E06000008","E06000028","E09000005","E06000043","E06000023","E09000006","E10000002","E10000003","E09000007","E06000049","E41000052","E09000008","E10000006","E06000005","E07000035","E06000015","E07000042","E10000009","E06000047","E09000009","E06000011","E10000011","E09000010","E10000012","E10000013","E09000011","E09000012","E06000006","E09000013","E10000014","E09000014","E09000015","E06000001","E09000016","E06000019","E10000015","E09000017","E09000018","E06000046","E09000019","E09000020","E10000016","E06000010","E09000021","E09000022","E10000017","E06000016","E10000018","E09000023","E10000019","E02000001","E06000032","E08000003","E06000035","E11000002","E09000024","E06000002","E06000042","E09000025","E10000020","E06000012","E06000013","E06000024","E10000023","E10000021","E06000048","E06000018","E10000024","E10000025","E06000031","E06000026","E06000029","E06000044","E09000026","E06000003","E09000027","E06000017","E06000051","E10000027","E06000025","E11000003","E06000045","E06000033","E09000028","E10000028","E06000021","E06000004","E10000029","E10000030","E09000029","E06000030","E06000020","E06000034","E06000027","E09000030","E11000004","E09000031","E09000032","E06000007","E10000031","E11000005","E10000032","E11000006","E41000324","E06000054","E10000034","E06000014"]
In [732]:
filtered_dep = df_deprivation[df_deprivation['GEO_CODE'].isin(GeoCodes)]
filtered_dep.head()
Out[732]:
CDU_ID GEO_CODE GEO_LABEL GEO_TYPE GEO_TYP2 Deprivation; classification of household [E][S][W] : Total\ Classification of household deprivation - Unit : Households Deprivation; classification of household [E][S][W] : Household is not deprived in any dimension - Unit : Households Deprivation; classification of household [E][S][W] : Household is deprived in 1 dimension - Unit : Households Deprivation; classification of household [E][S][W] : Household is deprived in 2 dimensions - Unit : Households Deprivation; classification of household [E][S][W] : Household is deprived in 3 dimensions - Unit : Households Deprivation; classification of household [E][S][W] : Household is deprived in 4 dimensions - Unit : Households
11 17.0 E10000002 Buckinghamshire Counties CNTY 200727.0 106894.0 61449.0 26862.0 5007.0 515.0
12 18.0 E10000003 Cambridgeshire Counties CNTY 251241.0 123285.0 79821.0 39389.0 8067.0 679.0
13 19.0 E10000006 Cumbria Counties CNTY 222042.0 96017.0 72142.0 41985.0 10994.0 904.0
16 22.0 E10000009 Dorset Counties CNTY 180213.0 80487.0 62366.0 31001.0 5823.0 536.0
17 23.0 E10000011 East Sussex Counties CNTY 231905.0 98156.0 78965.0 43629.0 10054.0 1101.0
In [ ]:
 
In [733]:
Deprivation; classification of household [E][S][W] : Total\ Classification of household deprivation - Unit : Households
        Deprivation; classification of household [E][S][W] : Household is not deprived in any dimension - Unit : Households
                Deprivation; classification of household [E][S][W] : Household is deprived in 1 dimension - Unit : Households
                        Deprivation; classification of household [E][S][W] : Household is deprived in 2 dimensions - Unit : Households
                                Deprivation; classification of household [E][S][W] : Household is deprived in 3 dimensions - Unit : Households
                                        Deprivation; classification of household [E][S][W] : Household is deprived in 4 dimensions - Unit : Households
  File "<ipython-input-733-680fa3ef9125>", line 1
    Deprivation; classification of household [E][S][W] : Total\ Classification of household deprivation - Unit : Households
                                ^
SyntaxError: invalid syntax
In [735]:
filtered_dep.rename(columns = {'Deprivation; classification of household [E][S][W] : Total\ Classification of household deprivation - Unit : Households':'Total households examined using deprivation indexes',
                                'Deprivation; classification of household [E][S][W] : Household is not deprived in any dimension - Unit : Households':'Number of households which are not deprived in any of the scrutinised dimensions',
                                'Deprivation; classification of household [E][S][W] : Household is deprived in 1 dimension - Unit : Households':'Number of households which are deprived by 1 of the scrutinised dimensions',
                                'Deprivation; classification of household [E][S][W] : Household is deprived in 2 dimensions - Unit : Households':'Number of households which are deprived by 2 of the scrutinised dimensions',
                                'Deprivation; classification of household [E][S][W] : Household is deprived in 3 dimensions - Unit : Households':'Number of households which are deprived by 3 of the scrutinised dimensions',
                                'Deprivation; classification of household [E][S][W] : Household is deprived in 4 dimensions - Unit : Households':'Number of households which are deprived by 4 of the scrutinised dimensions'}, inplace = True)
In [736]:
filtered_dep
Out[736]:
CDU_ID GEO_CODE GEO_LABEL GEO_TYPE GEO_TYP2 Total households examined using deprivation indexes Number of households which are not deprived in any of the scrutinised dimensions Number of households which are deprived by 1 of the scrutinised dimensions Number of households which are deprived by 2 of the scrutinised dimensions Number of households which are deprived by 3 of the scrutinised dimensions Number of households which are deprived by 4 of the scrutinised dimensions
11 17.0 E10000002 Buckinghamshire Counties CNTY 200727.0 106894.0 61449.0 26862.0 5007.0 515.0
12 18.0 E10000003 Cambridgeshire Counties CNTY 251241.0 123285.0 79821.0 39389.0 8067.0 679.0
13 19.0 E10000006 Cumbria Counties CNTY 222042.0 96017.0 72142.0 41985.0 10994.0 904.0
16 22.0 E10000009 Dorset Counties CNTY 180213.0 80487.0 62366.0 31001.0 5823.0 536.0
17 23.0 E10000011 East Sussex Counties CNTY 231905.0 98156.0 78965.0 43629.0 10054.0 1101.0
... ... ... ... ... ... ... ... ... ... ... ...
366 398.0 E09000031 Waltham Forest Local Authorities LA 96861.0 33217.0 33920.0 21764.0 7027.0 933.0
367 399.0 E09000032 Wandsworth Local Authorities LA 130493.0 65759.0 39677.0 18174.0 6064.0 819.0
368 400.0 E41000052 Cornwall, Isles of Scilly Local Authorities LA 231378.0 92952.0 80131.0 45994.0 11162.0 1139.0
369 401.0 E41000324 City of London, Westminster Local Authorities LA 110157.0 43480.0 38278.0 18840.0 7912.0 1647.0
8048 9937.0 E02000001 City of London 001 Middle Super Output Areas and Intermediate Zones MSOAIZ 4385.0 1973.0 1772.0 499.0 122.0 19.0

112 rows × 11 columns

In [969]:
filtered_dep.loc[filtered_dep.GEO_CODE == 'E06000037', 'GEO_LABEL'] = "Berkshire"
filtered_dep.loc[filtered_dep.GEO_CODE == 'E06000023', 'GEO_LABEL'] = "Bristol"
filtered_dep.loc[filtered_dep.GEO_CODE == 'E06000049', 'GEO_LABEL'] = "Cheshire"
filtered_dep.loc[filtered_dep.GEO_CODE == 'E41000052', 'GEO_LABEL'] = "Cornwall"
filtered_dep.loc[filtered_dep.GEO_CODE == 'E07000035', 'GEO_LABEL'] = "Derbyshire"
filtered_dep.loc[filtered_dep.GEO_CODE == 'E07000042', 'GEO_LABEL'] = "Devon"
filtered_dep.loc[filtered_dep.GEO_CODE == 'E06000047', 'GEO_LABEL'] = "Durham"
filtered_dep.loc[filtered_dep.GEO_CODE == 'E06000019', 'GEO_LABEL'] = "Herefordshire"
filtered_dep.loc[filtered_dep.GEO_CODE == 'E06000010', 'GEO_LABEL'] = "Kingston upon Hull"
filtered_dep.loc[filtered_dep.GEO_CODE == 'E02000001', 'GEO_LABEL'] = "City of London"
filtered_dep.loc[filtered_dep.GEO_CODE == 'E41000324', 'GEO_LABEL'] = "Westminster"
filtered_dep.loc[filtered_dep.GEO_CODE == 'E06000021', 'GEO_LABEL'] = "Stockton-on-Trent"
In [1015]:
filtered_dep
Out[1015]:
GEO_CODE GEO_LABEL GEO_TYPE GEO_TYP2 Total households examined using deprivation indexes Number of households which are not deprived in any of the scrutinised dimensions Number of households which are deprived by 1 of the scrutinised dimensions Number of households which are deprived by 2 of the scrutinised dimensions Number of households which are deprived by 3 of the scrutinised dimensions Number of households which are deprived by 4 of the scrutinised dimensions
11 E10000002 Buckinghamshire Counties CNTY 200727.0 106894.0 61449.0 26862.0 5007.0 515.0
12 E10000003 Cambridgeshire Counties CNTY 251241.0 123285.0 79821.0 39389.0 8067.0 679.0
13 E10000006 Cumbria Counties CNTY 222042.0 96017.0 72142.0 41985.0 10994.0 904.0
16 E10000009 Dorset Counties CNTY 180213.0 80487.0 62366.0 31001.0 5823.0 536.0
17 E10000011 East Sussex Counties CNTY 231905.0 98156.0 78965.0 43629.0 10054.0 1101.0
... ... ... ... ... ... ... ... ... ... ...
366 E09000031 Waltham Forest Local Authorities LA 96861.0 33217.0 33920.0 21764.0 7027.0 933.0
367 E09000032 Wandsworth Local Authorities LA 130493.0 65759.0 39677.0 18174.0 6064.0 819.0
368 E41000052 Cornwall Local Authorities LA 231378.0 92952.0 80131.0 45994.0 11162.0 1139.0
369 E41000324 Westminster Local Authorities LA 110157.0 43480.0 38278.0 18840.0 7912.0 1647.0
8048 E02000001 City of London Middle Super Output Areas and Intermediate Zones MSOAIZ 4385.0 1973.0 1772.0 499.0 122.0 19.0

112 rows × 10 columns

In [1016]:
filtered_dep = filtered_dep.drop(['GEO_TYPE'], axis=1)
In [1017]:
filtered_dep = filtered_dep.drop(['GEO_TYP2'], axis=1)
In [737]:
filtered_dep = filtered_dep.drop(['CDU_ID'], axis=1)
In [1019]:
filtered_dep.head()
Out[1019]:
GEO_CODE GEO_LABEL Total households examined using deprivation indexes Number of households which are not deprived in any of the scrutinised dimensions Number of households which are deprived by 1 of the scrutinised dimensions Number of households which are deprived by 2 of the scrutinised dimensions Number of households which are deprived by 3 of the scrutinised dimensions Number of households which are deprived by 4 of the scrutinised dimensions
11 E10000002 Buckinghamshire 200727.0 106894.0 61449.0 26862.0 5007.0 515.0
12 E10000003 Cambridgeshire 251241.0 123285.0 79821.0 39389.0 8067.0 679.0
13 E10000006 Cumbria 222042.0 96017.0 72142.0 41985.0 10994.0 904.0
16 E10000009 Dorset 180213.0 80487.0 62366.0 31001.0 5823.0 536.0
17 E10000011 East Sussex 231905.0 98156.0 78965.0 43629.0 10054.0 1101.0
In [ ]:
 
In [ ]:
 
In [738]:
# Transform the data into a numpy ndarray
np_array_dep = filtered_dep.values
In [739]:
np_array_dep
Out[739]:
array([['E10000002', 'Buckinghamshire', 'Counties', ..., 26862.0, 5007.0,
        515.0],
       ['E10000003', 'Cambridgeshire', 'Counties', ..., 39389.0, 8067.0,
        679.0],
       ['E10000006', 'Cumbria', 'Counties', ..., 41985.0, 10994.0, 904.0],
       ...,
       ['E41000052', 'Cornwall, Isles of Scilly', 'Local Authorities',
        ..., 45994.0, 11162.0, 1139.0],
       ['E41000324', 'City of London, Westminster', 'Local Authorities',
        ..., 18840.0, 7912.0, 1647.0],
       ['E02000001', 'City of London 001',
        'Middle Super Output Areas and Intermediate Zones', ..., 499.0,
        122.0, 19.0]], dtype=object)
In [740]:
# Removing the first four columns
np_array_dep = np_array_dep[:, 4:]
In [741]:
np_array_dep
Out[741]:
array([[200727.0, 106894.0, 61449.0, 26862.0, 5007.0, 515.0],
       [251241.0, 123285.0, 79821.0, 39389.0, 8067.0, 679.0],
       [222042.0, 96017.0, 72142.0, 41985.0, 10994.0, 904.0],
       [180213.0, 80487.0, 62366.0, 31001.0, 5823.0, 536.0],
       [231905.0, 98156.0, 78965.0, 43629.0, 10054.0, 1101.0],
       [581589.0, 255205.0, 193345.0, 107615.0, 23257.0, 2167.0],
       [254615.0, 122736.0, 81780.0, 40770.0, 8529.0, 800.0],
       [545254.0, 274142.0, 173320.0, 80170.0, 16139.0, 1483.0],
       [453817.0, 223886.0, 143859.0, 69844.0, 14694.0, 1534.0],
       [605638.0, 264980.0, 200070.0, 111252.0, 26476.0, 2860.0],
       [496299.0, 213445.0, 157329.0, 96828.0, 26275.0, 2422.0],
       [267434.0, 130582.0, 84605.0, 43817.0, 7884.0, 546.0],
       [306971.0, 127466.0, 103324.0, 61903.0, 13237.0, 1041.0],
       [372085.0, 153938.0, 126531.0, 74131.0, 16122.0, 1363.0],
       [287730.0, 131085.0, 93387.0, 50366.0, 11862.0, 1030.0],
       [256594.0, 122837.0, 83245.0, 41705.0, 8153.0, 654.0],
       [334303.0, 144243.0, 106363.0, 67077.0, 15535.0, 1085.0],
       [258855.0, 134689.0, 80088.0, 35844.0, 7483.0, 751.0],
       [226989.0, 101303.0, 76133.0, 40561.0, 8233.0, 759.0],
       [355263.0, 156703.0, 113504.0, 68922.0, 14985.0, 1149.0],
       [310745.0, 136271.0, 104965.0, 56586.0, 11900.0, 1023.0],
       [455791.0, 243546.0, 140441.0, 59140.0, 11503.0, 1161.0],
       [231005.0, 109373.0, 73346.0, 39454.0, 8194.0, 638.0],
       [345614.0, 160244.0, 115557.0, 56841.0, 11785.0, 1187.0],
       [239717.0, 109688.0, 77262.0, 42677.0, 9330.0, 760.0],
       [602087.0, 218490.0, 189669.0, 141216.0, 48380.0, 4332.0],
       [565442.0, 215540.0, 181274.0, 129512.0, 36324.0, 2792.0],
       [484527.0, 189036.0, 152664.0, 107866.0, 32597.0, 2364.0],
       [1086748.0, 382438.0, 356868.0, 261039.0, 78436.0, 7967.0],
       [922452.0, 369515.0, 298250.0, 190547.0, 57998.0, 6142.0],
       [40434.0, 14351.0, 12821.0, 9970.0, 3104.0, 188.0],
       [57203.0, 19729.0, 18615.0, 13717.0, 4789.0, 353.0],
       [59605.0, 22517.0, 19351.0, 13773.0, 3681.0, 283.0],
       [79159.0, 34306.0, 24692.0, 15755.0, 4114.0, 292.0],
       [46670.0, 20184.0, 14922.0, 8954.0, 2407.0, 203.0],
       [53312.0, 20060.0, 16642.0, 12238.0, 4105.0, 267.0],
       [85140.0, 40205.0, 25930.0, 14960.0, 3755.0, 290.0],
       [57353.0, 20599.0, 18421.0, 13105.0, 4779.0, 449.0],
       [112596.0, 37719.0, 37207.0, 27899.0, 9024.0, 747.0],
       [143032.0, 66055.0, 46722.0, 25187.0, 4665.0, 403.0],
       [69707.0, 26569.0, 23201.0, 15505.0, 4132.0, 300.0],
       [70684.0, 29434.0, 23407.0, 14508.0, 3116.0, 219.0],
       [83552.0, 41409.0, 26882.0, 12526.0, 2524.0, 211.0],
       [102271.0, 41618.0, 33029.0, 20996.0, 6054.0, 574.0],
       [123125.0, 40214.0, 41982.0, 29765.0, 9915.0, 1249.0],
       [15002.0, 8042.0, 4679.0, 1936.0, 332.0, 13.0],
       [126131.0, 43326.0, 42708.0, 29144.0, 9942.0, 1011.0],
       [78319.0, 34469.0, 26487.0, 13969.0, 3131.0, 263.0],
       [66608.0, 27793.0, 21348.0, 13548.0, 3673.0, 246.0],
       [107575.0, 36409.0, 33967.0, 27659.0, 8902.0, 638.0],
       [73515.0, 36446.0, 23106.0, 11348.0, 2351.0, 264.0],
       [182747.0, 77443.0, 58827.0, 34808.0, 10430.0, 1239.0],
       [88227.0, 41762.0, 28397.0, 14492.0, 3212.0, 364.0],
       [107538.0, 54100.0, 33900.0, 16216.0, 3042.0, 280.0],
       [109307.0, 43157.0, 36415.0, 22169.0, 6754.0, 812.0],
       [59010.0, 21315.0, 20367.0, 13148.0, 3627.0, 553.0],
       [82374.0, 33804.0, 27788.0, 15624.0, 4371.0, 787.0],
       [63530.0, 28688.0, 21222.0, 11125.0, 2264.0, 231.0],
       [88360.0, 40187.0, 28500.0, 15432.0, 3880.0, 361.0],
       [74023.0, 29667.0, 24589.0, 15215.0, 4115.0, 437.0],
       [74293.0, 26640.0, 26416.0, 16185.0, 4538.0, 514.0],
       [74678.0, 29872.0, 24948.0, 14927.0, 4381.0, 550.0],
       [62353.0, 24074.0, 21335.0, 13349.0, 3283.0, 312.0],
       [106209.0, 44119.0, 35652.0, 20314.0, 5550.0, 574.0],
       [62340.0, 33452.0, 18842.0, 8218.0, 1667.0, 161.0],
       [98584.0, 46086.0, 31678.0, 16220.0, 4169.0, 431.0],
       [121540.0, 52226.0, 39931.0, 21628.0, 6704.0, 1051.0],
       [85473.0, 35779.0, 28114.0, 16105.0, 4894.0, 581.0],
       [98254.0, 39116.0, 32798.0, 19691.0, 5871.0, 778.0],
       [61085.0, 23935.0, 21358.0, 12460.0, 2992.0, 340.0],
       [223803.0, 87986.0, 68885.0, 51846.0, 14353.0, 733.0],
       [138534.0, 60451.0, 44844.0, 26716.0, 6150.0, 373.0],
       [159441.0, 79623.0, 48842.0, 25289.0, 5287.0, 400.0],
       [129674.0, 59558.0, 42311.0, 22718.0, 4713.0, 374.0],
       [194194.0, 96645.0, 62496.0, 28845.0, 5720.0, 488.0],
       [63812.0, 29041.0, 20595.0, 11212.0, 2671.0, 293.0],
       [30744.0, 14941.0, 9756.0, 5074.0, 903.0, 70.0],
       [32758.0, 14780.0, 11023.0, 5642.0, 1205.0, 108.0],
       [204969.0, 70540.0, 68763.0, 45745.0, 18104.0, 1817.0],
       [69681.0, 19618.0, 24953.0, 18382.0, 6027.0, 701.0],
       [92604.0, 38457.0, 31937.0, 17839.0, 4048.0, 323.0],
       [110286.0, 34085.0, 40538.0, 25540.0, 8741.0, 1382.0],
       [130862.0, 63450.0, 41818.0, 20141.0, 4931.0, 522.0],
       [97534.0, 36975.0, 34569.0, 17469.0, 7085.0, 1436.0],
       [145010.0, 59399.0, 50369.0, 26731.0, 7457.0, 1054.0],
       [124082.0, 46377.0, 42757.0, 25522.0, 8212.0, 1214.0],
       [119916.0, 43308.0, 41080.0, 26163.0, 8172.0, 1193.0],
       [101045.0, 37600.0, 34480.0, 20809.0, 7239.0, 917.0],
       [101690.0, 31994.0, 36314.0, 22514.0, 9308.0, 1560.0],
       [80590.0, 33427.0, 27335.0, 13749.0, 5240.0, 839.0],
       [101955.0, 36356.0, 34280.0, 21949.0, 7968.0, 1402.0],
       [84268.0, 35196.0, 28917.0, 15813.0, 3892.0, 450.0],
       [97199.0, 38567.0, 33711.0, 20248.0, 4272.0, 401.0],
       [100214.0, 40143.0, 34824.0, 19622.0, 5029.0, 596.0],
       [94902.0, 35413.0, 33763.0, 19203.0, 5774.0, 749.0],
       [93556.0, 34289.0, 31880.0, 18351.0, 7762.0, 1274.0],
       [78536.0, 34278.0, 27161.0, 11908.0, 4366.0, 823.0],
       [63639.0, 32175.0, 20186.0, 8892.0, 2140.0, 246.0],
       [130017.0, 51814.0, 44100.0, 24158.0, 8654.0, 1291.0],
       [116091.0, 44523.0, 39640.0, 22963.0, 7871.0, 1094.0],
       [78757.0, 36641.0, 25594.0, 12685.0, 3457.0, 380.0],
       [101519.0, 25356.0, 38692.0, 26412.0, 9748.0, 1311.0],
       [99105.0, 38422.0, 34616.0, 20000.0, 5455.0, 612.0],
       [79835.0, 46005.0, 22322.0, 8994.0, 2237.0, 277.0],
       [120422.0, 43590.0, 42428.0, 24121.0, 8961.0, 1322.0],
       [78174.0, 35636.0, 26211.0, 12776.0, 3195.0, 356.0],
       [101257.0, 33144.0, 36199.0, 20987.0, 9432.0, 1495.0],
       [96861.0, 33217.0, 33920.0, 21764.0, 7027.0, 933.0],
       [130493.0, 65759.0, 39677.0, 18174.0, 6064.0, 819.0],
       [231378.0, 92952.0, 80131.0, 45994.0, 11162.0, 1139.0],
       [110157.0, 43480.0, 38278.0, 18840.0, 7912.0, 1647.0],
       [4385.0, 1973.0, 1772.0, 499.0, 122.0, 19.0]], dtype=object)
In [742]:
# Building the PCA model
pca_dep = PCA(n_components=2).fit(np_array_dep) # We're telling PCA here to keep only the 2 first principal components when fitting the data
pca_x_d = pca_dep.transform(np_array_dep) # This variable will contain the coordinates of each row in the new system of coordinates found by PCA (restricted to the first two directions or principal components)
# Those 2 Principal Components are going to be used to project the data onto a 2D space in order to be able 
# to visualize a "compressed" version (in terms of variance) of our initial multivariate DataFrame

# Let's check out the result of our PCA applied to the dataFrame df
print("The pca_x_h variable has a shape of :", pca_x_d.shape)
print(pca_x_d)
The pca_x_h variable has a shape of : (112, 2)
[[ 3.14940336e+04 -2.53921660e+04]
 [ 8.89104976e+04 -2.05737734e+04]
 [ 5.21786943e+04 -2.66398277e+03]
 [ 5.22154858e+03 -5.53591283e+03]
 [ 6.37003583e+04 -1.35327378e+03]
 [ 4.68692123e+05 -1.76482225e+04]
 [ 9.24738580e+04 -1.85248076e+04]
 [ 4.32920488e+05 -5.69636233e+04]
 [ 3.25108519e+05 -4.15433824e+04]
 [ 4.95836217e+05 -1.78679897e+04]
 [ 3.67568242e+05 -7.67847325e+03]
 [ 1.07752522e+05 -2.08488619e+04]
 [ 1.49839210e+05  1.04930239e+03]
 [ 2.24874189e+05  6.81196670e+02]
 [ 1.29447646e+05 -1.24556146e+04]
 [ 9.47937691e+04 -1.77298463e+04]
 [ 1.81539624e+05 -3.98782126e+03]
 [ 9.90319911e+04 -3.02604347e+04]
 [ 5.91079732e+04 -7.03822528e+03]
 [ 2.06561934e+05 -8.72119267e+03]
 [ 1.55717395e+05 -8.20407140e+03]
 [ 3.30822162e+05 -6.34935602e+04]
 [ 6.45084969e+04 -1.36170498e+04]
 [ 1.97677849e+05 -2.00324929e+04]
 [ 7.39431561e+04 -1.00121100e+04]
 [ 4.79645692e+05  3.84067692e+04]
 [ 4.41620085e+05  2.44459776e+04]
 [ 3.49562946e+05  1.64891511e+04]
 [ 1.03035259e+06  7.84278644e+04]
 [ 8.52517460e+05  1.27844140e+04]
 [-1.58000147e+05  6.25104580e+03]
 [-1.39088625e+05  7.82921886e+03]
 [-1.35837057e+05  5.85195714e+03]
 [-1.12712519e+05  1.54500239e+03]
 [-1.50108550e+05  2.21011270e+03]
 [-1.43212457e+05  5.75621849e+03]
 [-1.05205713e+05 -2.48050951e+03]
 [-1.38811109e+05  6.82803831e+03]
 [-7.64336092e+04  1.33985676e+04]
 [-3.78180970e+04 -5.38416747e+03]
 [-1.24175035e+05  5.77906442e+03]
 [-1.22470350e+05  2.92086809e+03]
 [-1.06387913e+05 -5.18290605e+03]
 [-8.65894382e+04  3.77869915e+03]
 [-6.46490276e+04  1.49176663e+04]
 [-1.86271527e+05  1.19018175e+03]
 [-6.08250260e+04  1.27230494e+04]
 [-1.13250224e+05  2.81794419e+02]
 [-1.27330535e+05  2.95722493e+03]
 [-8.22348434e+04  1.30857520e+04]
 [-1.18185105e+05 -4.01068461e+03]
 [ 6.25712810e+03  1.02583386e+02]
 [-1.01377366e+05 -3.31087580e+03]
 [-7.82987888e+04 -8.36677030e+03]
 [-7.87140791e+04  4.90245048e+03]
 [-1.36613372e+05  6.47473291e+03]
 [-1.09231008e+05  2.81858054e+03]
 [-1.30230218e+05  9.46998456e+01]
 [-1.01594696e+05 -1.37272064e+03]
 [-1.18970199e+05  4.05427707e+03]
 [-1.19109577e+05  7.29844995e+03]
 [-1.18263694e+05  3.96187315e+03]
 [-1.32424854e+05  4.97279689e+03]
 [-8.16752440e+04  2.26016093e+03]
 [-1.30782075e+05 -5.78652969e+03]
 [-8.95315209e+04 -3.48628671e+03]
 [-6.39265191e+04 -1.15302042e+02]
 [-1.05622976e+05  2.19677407e+03]
 [-9.12900276e+04  4.33135873e+03]
 [-1.33746063e+05  4.33454928e+03]
 [ 5.18573552e+04  9.62066592e+03]
 [-4.39209296e+04 -7.37760028e+02]
 [-1.80291516e+04 -1.28956093e+04]
 [-5.34559601e+04 -4.24311859e+03]
 [ 2.28225781e+04 -1.70743096e+04]
 [-1.29998659e+05 -5.13835140e+01]
 [-1.68076032e+05  7.12033159e+02]
 [-1.65906803e+05  1.68661651e+03]
 [ 2.82730678e+04  1.80281498e+04]
 [-1.25581327e+05  1.34418497e+04]
 [-9.71119577e+04  2.36630339e+03]
 [-7.92438741e+04  1.49442299e+04]
 [-5.16185161e+04 -8.44918700e+03]
 [-9.25217136e+04  5.25901749e+03]
 [-3.70353072e+04  2.12723102e+03]
 [-6.22379256e+04  7.64786838e+03]
 [-6.73222110e+04  9.51446609e+03]
 [-8.86609222e+04  7.11389315e+03]
 [-8.91792867e+04  1.32765729e+04]
 [-1.11340499e+05  2.01825346e+03]
 [-8.81271278e+04  9.01869667e+03]
 [-1.06757452e+05  2.12605750e+03]
 [-9.21288268e+04  4.55323599e+03]
 [-8.87009197e+04  3.80825547e+03]
 [-9.53553961e+04  6.51349383e+03]
 [-9.75118642e+04  7.05543237e+03]
 [-1.13246982e+05 -2.01373970e+02]
 [-1.29585550e+05 -3.94283759e+03]
 [-5.49694836e+04  3.90302863e+03]
 [-7.12126974e+04  6.01477232e+03]
 [-1.12556505e+05 -2.03521268e+03]
 [-9.03150596e+04  2.08766394e+04]
 [-9.02521179e+04  5.26521524e+03]
 [-1.09909426e+05 -1.18302479e+04]
 [-6.67291369e+04  8.66420643e+03]
 [-1.13247054e+05 -1.29532762e+03]
 [-8.94455166e+04  1.15201670e+04]
 [-9.38690693e+04  1.02528527e+04]
 [-5.20114245e+04 -1.12857078e+04]
 [ 6.21860204e+04  4.30713687e+03]
 [-7.78716613e+04  3.57206215e+03]
 [-1.98773887e+05  3.08488900e+03]]
In [743]:
plt.subplot(1,2,1)
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.title("Projection of the original data onto the first 2 PCs")
plt.scatter(pca_x_d[:,0], pca_x_d[:,1],cmap='viridis')
Out[743]:
<matplotlib.collections.PathCollection at 0x7f97a2456670>
In [744]:
# Ordering the features by their coefficients in the equation of the first Principal Component
# PC1 = a*column1 + b*column2 + ... + zzz*column101. The coefficients of PC1 a, b, c, ..zzz also called loadings, are stored in pca_clean.components_[0]
pca_x_1 = pca_dep.components_[0] # First principal component
sort_array_1 = np.argsort(np.abs(pca_x_1)) # Sorting the 'Indexes' of the pca_clean_1 array from the lowest value to the highest (in magnitude, ignoring the sign)
pca_x_1_highest_values = pca_x_1[sort_array_1][::-1][:10] # We sorted from low to high, [::-1] retrieves all values backward (to inverse the sort), [:10] get the first
pca_x_1_cols_highest_values = filtered_dep.columns[4+sort_array_1][::-1][:10] # retrieving the names of the columns corresponding to the highest values
# The '1+sort_array' accounts for the fact that df has the 'communityname' column and sort_array sorted indices in an array that didn't have the column (we dropped it before doing PCA)

# Same thing but ... a little bit more compact
pca_x_2_highest_values = np.asarray(pca_dep.components_[1])[np.argsort(np.abs(pca_dep.components_[1]))][::-1][:10]
pca_x_2_cols_highest_values = filtered_dep.columns[4+np.argsort(np.abs(pca_dep.components_[1]))][::-1][:10]

# Printing the result
print('\nFirst Principal Component : 10 Highest Loadings (in magnitude)\n')
for i in range(6):
    print('The Column "{}" has a loading of : {}'.format(pca_x_1_cols_highest_values[i], pca_x_1_highest_values[i]))

print('\nSecond Principal Component : 10 Highest Loadings (in magnitude)\n')
for i in range(6):
    print('The Column "{}" has a loading of : {}'.format(pca_x_2_cols_highest_values[i], pca_x_2_highest_values[i]))
First Principal Component : 10 Highest Loadings (in magnitude)

The Column "Total households examined using deprivation indexes" has a loading of : 0.8710402758652738
The Column "Number of households which are not deprived in any of the scrutinised dimensions" has a loading of : 0.35761435527104735
The Column "Number of households which are deprived by 1 of the scrutinised dimensions" has a loading of : 0.28079828501515935
The Column "Number of households which are deprived by 2 of the scrutinised dimensions" has a loading of : 0.17930261065228725
The Column "Number of households which are deprived by 3 of the scrutinised dimensions" has a loading of : 0.048820225891445156
The Column "Number of households which are deprived by 4 of the scrutinised dimensions" has a loading of : 0.004504799035334244

Second Principal Component : 10 Highest Loadings (in magnitude)

The Column "Number of households which are not deprived in any of the scrutinised dimensions" has a loading of : -0.7805151914265704
The Column "Number of households which are deprived by 2 of the scrutinised dimensions" has a loading of : 0.5290700434818134
The Column "Number of households which are deprived by 3 of the scrutinised dimensions" has a loading of : 0.2673172296269874
The Column "Total households examined using deprivation indexes" has a loading of : 0.15934462426612084
The Column "Number of households which are deprived by 1 of the scrutinised dimensions" has a loading of : 0.11497704641678935
The Column "Number of households which are deprived by 4 of the scrutinised dimensions" has a loading of : 0.028495496167100984
In [745]:
# Let's extract the numeric columns and turn them into a numpy 2d array (matrix)
dep_num = filtered_dep.select_dtypes(include='number') # selects numeric columns only
dep_num.head()
Out[745]:
Total households examined using deprivation indexes Number of households which are not deprived in any of the scrutinised dimensions Number of households which are deprived by 1 of the scrutinised dimensions Number of households which are deprived by 2 of the scrutinised dimensions Number of households which are deprived by 3 of the scrutinised dimensions Number of households which are deprived by 4 of the scrutinised dimensions
11 200727.0 106894.0 61449.0 26862.0 5007.0 515.0
12 251241.0 123285.0 79821.0 39389.0 8067.0 679.0
13 222042.0 96017.0 72142.0 41985.0 10994.0 904.0
16 180213.0 80487.0 62366.0 31001.0 5823.0 536.0
17 231905.0 98156.0 78965.0 43629.0 10054.0 1101.0
In [746]:
# Let's compute the distance matrix of pairwise euclidean distances
dist_mat_dep = cdist(dep_num, dep_num) # computes the distances between rows of first dataframe and rows of second dataframe. Here I gave the same dataFrame twice, to get a pairwise distance matrix
print("The shape of this matrix is", dist_mat_dep.shape)
The shape of this matrix is (112, 112)
In [747]:
mds_dep = MDS(n_components=2, random_state=123, dissimilarity='precomputed') # dissimilarity precomputed means that we provide the distance matrix, otherwise MDS() can compute it itself
mds_x_dep = mds_dep.fit_transform(dist_mat_dep)
In [970]:
# Let's visualize the results
plt.figure(num=5, figsize=(25, 12))
plt.scatter(mds_x_dep[:, 0], mds_x_dep[:, 1], alpha = .3, s=100)

# We need to loop over every borough, get its name from the borough dataframe and its location from mds_x and add the text to our scatterplot
for index,name in enumerate(filtered_dep['GEO_LABEL']):
    plt.annotate(s=name, xy=(mds_x_dep[index, 0], mds_x_dep[index, 1]))

plt.xlabel('Axis 1')
plt.ylabel('Axis 2')
plt.title('Multidimensional Scaling on Districts and subdivisions (All rows)');
In [ ]:
 

Unpaid Carers

Description: A person is a provider of unpaid care if they look after or give help or support to family members, friends, neighbours or others because of long-term physical or mental ill-health or disability, or problems related to old age. This does not include any activities as part of paid employment. No distinction is made about whether any care that a person provides is within their own household or outside the household, so no explicit link can be made about whether the care provided is for a person within the household who has poor general health or a long-term health problem or disability.

In [749]:
df_uncar=pd.read_csv("Data on Unpaid Carers.csv")
df_uncar = df_uncar.iloc[1: , :]
In [750]:
df_uncar.head()
Out[750]:
CDU_ID GEO_CODE GEO_LABEL GEO_TYPE GEO_TYP2 Care (unpaid); provision of : Provides no unpaid care - Unit : Persons Care (unpaid); provision of : Provides 1 to 19 hours unpaid care a week - Unit : Persons Care (unpaid); provision of : Provides 20 to 49 hours unpaid care a week - Unit : Persons Care (unpaid); provision of : Provides 50 or more hours unpaid care a week - Unit : Persons Care (unpaid); provision of : All categories\ Provision of unpaid care - Unit : Persons
1 4.0 E92000001 England Countries and Groupings CTRY 47582440.0 3452636.0 721143.0 1256237.0 NaN
2 8.0 E12000001 North East Regions RGN 2310535.0 165828.0 41778.0 78745.0 2596886.0
3 9.0 E12000002 North West Regions RGN 6270205.0 469493.0 113003.0 199476.0 7052177.0
4 10.0 E12000003 Yorkshire and The Humber Regions RGN 4732392.0 341658.0 74574.0 135109.0 5283733.0
5 11.0 E12000004 East Midlands Regions RGN 4042973.0 311813.0 63603.0 114833.0 4533222.0
In [751]:
GeoCodes=["E09000002","E06000022","E06000055","E06000037","E09000004","E06000008","E06000028","E09000005","E06000043","E06000023","E09000006","E10000002","E10000003","E09000007","E06000049","E41000052","E09000008","E10000006","E06000005","E07000035","E06000015","E07000042","E10000009","E06000047","E09000009","E06000011","E10000011","E09000010","E10000012","E10000013","E09000011","E09000012","E06000006","E09000013","E10000014","E09000014","E09000015","E06000001","E09000016","E06000019","E10000015","E09000017","E09000018","E06000046","E09000019","E09000020","E10000016","E06000010","E09000021","E09000022","E10000017","E06000016","E10000018","E09000023","E10000019","E02000001","E06000032","E08000003","E06000035","E11000002","E09000024","E06000002","E06000042","E09000025","E10000020","E06000012","E06000013","E06000024","E10000023","E10000021","E06000048","E06000018","E10000024","E10000025","E06000031","E06000026","E06000029","E06000044","E09000026","E06000003","E09000027","E06000017","E06000051","E10000027","E06000025","E11000003","E06000045","E06000033","E09000028","E10000028","E06000021","E06000004","E10000029","E10000030","E09000029","E06000030","E06000020","E06000034","E06000027","E09000030","E11000004","E09000031","E09000032","E06000007","E10000031","E11000005","E10000032","E11000006","E41000324","E06000054","E10000034","E06000014"]
In [752]:
filtered_uncar = df_uncar[df_uncar['GEO_CODE'].isin(GeoCodes)]
filtered_uncar.head()
Out[752]:
CDU_ID GEO_CODE GEO_LABEL GEO_TYPE GEO_TYP2 Care (unpaid); provision of : Provides no unpaid care - Unit : Persons Care (unpaid); provision of : Provides 1 to 19 hours unpaid care a week - Unit : Persons Care (unpaid); provision of : Provides 20 to 49 hours unpaid care a week - Unit : Persons Care (unpaid); provision of : Provides 50 or more hours unpaid care a week - Unit : Persons Care (unpaid); provision of : All categories\ Provision of unpaid care - Unit : Persons
11 17.0 E10000002 Buckinghamshire Counties CNTY 455769.0 35820.0 5268.0 8426.0 505283.0
12 18.0 E10000003 Cambridgeshire Counties CNTY 561034.0 41313.0 6785.0 12078.0 621210.0
13 19.0 E10000006 Cumbria Counties CNTY 443363.0 35927.0 7265.0 13303.0 499858.0
16 22.0 E10000009 Dorset Counties CNTY 363583.0 33362.0 5456.0 10504.0 412905.0
17 23.0 E10000011 East Sussex Counties CNTY 467262.0 39537.0 6745.0 13127.0 526671.0
In [753]:
filtered_uncar = filtered_uncar.drop(['CDU_ID'], axis=1)
In [754]:
# Transform the data into a numpy ndarray
np_array_uncar = filtered_uncar.values
In [755]:
np_array_uncar
Out[755]:
array([['E10000002', 'Buckinghamshire', 'Counties', ..., 5268.0, 8426.0,
        505283.0],
       ['E10000003', 'Cambridgeshire', 'Counties', ..., 6785.0, 12078.0,
        621210.0],
       ['E10000006', 'Cumbria', 'Counties', ..., 7265.0, 13303.0,
        499858.0],
       ...,
       ['E41000052', 'Cornwall, Isles of Scilly', 'Local Authorities',
        ..., 8304.0, 15891.0, 534476.0],
       ['E41000324', 'City of London, Westminster', 'Local Authorities',
        ..., 2587.0, 3496.0, 226771.0],
       ['E02000001', 'City of London 001',
        'Middle Super Output Areas and Intermediate Zones', ..., 51.0,
        70.0, 7375.0]], dtype=object)
In [971]:
filtered_uncar.loc[filtered_uncar.GEO_CODE == 'E06000037', 'GEO_LABEL'] = "Berkshire"
filtered_uncar.loc[filtered_uncar.GEO_CODE == 'E06000023', 'GEO_LABEL'] = "Bristol"
filtered_uncar.loc[filtered_uncar.GEO_CODE == 'E06000049', 'GEO_LABEL'] = "Cheshire"
filtered_uncar.loc[filtered_uncar.GEO_CODE == 'E41000052', 'GEO_LABEL'] = "Cornwall"
filtered_uncar.loc[filtered_uncar.GEO_CODE == 'E07000035', 'GEO_LABEL'] = "Derbyshire"
filtered_uncar.loc[filtered_uncar.GEO_CODE == 'E07000042', 'GEO_LABEL'] = "Devon"
filtered_uncar.loc[filtered_uncar.GEO_CODE == 'E06000047', 'GEO_LABEL'] = "Durham"
filtered_uncar.loc[filtered_uncar.GEO_CODE == 'E06000019', 'GEO_LABEL'] = "Herefordshire"
filtered_uncar.loc[filtered_uncar.GEO_CODE == 'E06000010', 'GEO_LABEL'] = "Kingston upon Hull"
filtered_uncar.loc[filtered_uncar.GEO_CODE == 'E02000001', 'GEO_LABEL'] = "City of London"
filtered_uncar.loc[filtered_uncar.GEO_CODE == 'E41000324', 'GEO_LABEL'] = "Westminster"
filtered_uncar.loc[filtered_uncar.GEO_CODE == 'E06000021', 'GEO_LABEL'] = "Stockton-on-Trent"
In [1020]:
filtered_uncar = filtered_uncar.drop(['GEO_TYP2'], axis=1)
In [1021]:
filtered_uncar = filtered_uncar.drop(['GEO_TYPE'], axis=1)
In [1022]:
filtered_uncar.head()
Out[1022]:
GEO_CODE GEO_LABEL Care (unpaid); provision of : Provides no unpaid care - Unit : Persons Care (unpaid); provision of : Provides 1 to 19 hours unpaid care a week - Unit : Persons Care (unpaid); provision of : Provides 20 to 49 hours unpaid care a week - Unit : Persons Care (unpaid); provision of : Provides 50 or more hours unpaid care a week - Unit : Persons Care (unpaid); provision of : All categories\ Provision of unpaid care - Unit : Persons
11 E10000002 Buckinghamshire 455769.0 35820.0 5268.0 8426.0 505283.0
12 E10000003 Cambridgeshire 561034.0 41313.0 6785.0 12078.0 621210.0
13 E10000006 Cumbria 443363.0 35927.0 7265.0 13303.0 499858.0
16 E10000009 Dorset 363583.0 33362.0 5456.0 10504.0 412905.0
17 E10000011 East Sussex 467262.0 39537.0 6745.0 13127.0 526671.0
In [ ]:
 
In [756]:
# Removing the first four columns
np_array_uncar = np_array_uncar[:, 4:]
In [757]:
# Building the PCA model
pca_uncar = PCA(n_components=2).fit(np_array_uncar) # We're telling PCA here to keep only the 2 first principal components when fitting the data
pca_x_uncar = pca_uncar.transform(np_array_uncar) # This variable will contain the coordinates of each row in the new system of coordinates found by PCA (restricted to the first two directions or principal components)
# Those 2 Principal Components are going to be used to project the data onto a 2D space in order to be able 
# to visualize a "compressed" version (in terms of variance) of our initial multivariate DataFrame

# Let's check out the result of our PCA applied to the dataFrame df
print("The pca_x_h variable has a shape of :", pca_x_uncar.shape)
print(pca_x_uncar)
The pca_x_h variable has a shape of : (112, 2)
[[ 1.08449060e+05 -1.19072404e+03]
 [ 2.65159214e+05 -3.32345871e+03]
 [ 9.62818857e+04  5.37014594e+03]
 [-2.17403550e+04  8.23526176e+03]
 [ 1.32316778e+05  6.02279642e+03]
 [ 1.30014130e+06  3.39179191e+02]
 [ 2.29480597e+05  2.12636144e+03]
 [ 1.20173646e+06 -3.70498673e+03]
 [ 9.32550641e+05 -7.01583479e+03]
 [ 1.39546988e+06 -2.32574548e+03]
 [ 9.94794250e+05  9.92602180e+03]
 [ 2.99840085e+05  4.50883206e+03]
 [ 3.83352995e+05  4.75671764e+03]
 [ 5.77081114e+05  5.27719347e+03]
 [ 3.59016546e+05 -1.79274164e+03]
 [ 2.30097211e+05  4.11049106e+03]
 [ 4.77955567e+05  9.03498249e+03]
 [ 3.10563956e+05 -5.40011797e+03]
 [ 1.37628068e+05  4.68156007e+03]
 [ 5.61268144e+05  1.07783610e+04]
 [ 4.04896037e+05  2.76821340e+03]
 [ 9.55795814e+05 -7.96823324e+03]
 [ 1.58943527e+05  3.82702952e+03]
 [ 5.11803180e+05  1.66572099e+03]
 [ 1.85327140e+05  6.09791451e+03]
 [ 1.26974578e+06  1.67054170e+04]
 [ 1.22760200e+06  5.18366630e+03]
 [ 9.09423029e+05  1.37084794e+03]
 [ 3.10070757e+06 -3.29333031e+03]
 [ 2.42460155e+06 -1.33425203e+04]
 [-4.49512476e+05  2.07717883e+03]
 [-3.86792595e+05  1.16272917e+03]
 [-3.92519034e+05  3.54321239e+03]
 [-3.15391477e+05  1.37351126e+03]
 [-4.31106760e+05  1.99258763e+03]
 [-4.05189494e+05  3.42911362e+03]
 [-3.01593581e+05  2.47898595e+03]
 [-3.74990949e+05  1.81881654e+03]
 [-2.26413591e+05 -2.33306920e+03]
 [-1.25279917e+05  4.16009784e+03]
 [-3.58031537e+05  9.07230449e+02]
 [-3.48331505e+05  2.41634971e+03]
 [-3.05174397e+05 -4.33094207e+02]
 [-2.38368128e+05  1.00997268e+03]
 [-1.27483201e+05 -3.05102025e+03]
 [-5.22686766e+05  2.19657515e+03]
 [-1.59013791e+05 -4.30011205e+03]
 [-3.27268069e+05  3.62196646e+03]
 [-3.49354835e+05  2.06937005e+03]
 [-2.39190172e+05  2.34742484e+03]
 [-3.35837359e+05  1.33363645e+03]
 [ 5.56802817e+03 -3.57684920e+03]
 [-3.01378544e+05  3.23166702e+03]
 [-2.19821548e+05  2.30729231e+03]
 [-2.28687246e+05  1.56883121e+03]
 [-3.98507392e+05  4.14436494e+03]
 [-3.25167132e+05 -4.80742643e+01]
 [-3.74997856e+05  2.86949545e+03]
 [-2.90343278e+05 -7.42789455e+02]
 [-3.25224903e+05  6.36358811e+01]
 [-2.97994289e+05 -1.57519108e+03]
 [-3.39255295e+05  1.42544948e+03]
 [-3.59846270e+05 -2.79834495e+02]
 [-2.16639734e+05 -1.27222635e+03]
 [-3.65058017e+05  4.09056828e+02]
 [-2.35872218e+05 -2.71116572e+03]
 [-2.02590796e+05 -2.85783314e+03]
 [-2.94656661e+05 -2.71026042e+03]
 [-2.51818265e+05 -2.98608157e+03]
 [-3.88375256e+05  3.90974093e+03]
 [ 1.12792289e+05  6.43023619e+03]
 [-1.49854578e+05  4.13448727e+03]
 [-7.62042150e+04  3.31900477e+03]
 [-1.62862786e+05  4.14961699e+03]
 [ 6.12374286e+04  2.69431959e+02]
 [-3.61050428e+05  1.63503871e+03]
 [-4.78357267e+05  3.98138813e+03]
 [-4.68672470e+05  2.55871633e+03]
 [ 1.09441574e+05 -1.06869588e+04]
 [-3.21115699e+05 -2.10650576e+03]
 [-2.60682736e+05  8.82446501e+02]
 [-1.50975511e+05 -5.23318682e+03]
 [-1.56208489e+05  5.95809403e+02]
 [-2.73200129e+05 -4.11564890e+03]
 [-8.18083362e+04 -3.04341205e+03]
 [-1.13920740e+05 -5.85812188e+03]
 [-1.49814365e+05 -3.91654993e+03]
 [-2.27958170e+05 -3.48103175e+03]
 [-2.37084984e+05 -6.73076870e+03]
 [-3.23451046e+05 -5.18516774e+03]
 [-2.25453237e+05 -6.53652842e+03]
 [-2.51412583e+05  1.16198940e+03]
 [-2.54355592e+05  2.10731860e+03]
 [-2.03045414e+05 -1.33016897e+03]
 [-2.28971428e+05 -2.98188353e+03]
 [-2.92642098e+05 -3.81712403e+03]
 [-3.56203342e+05 -3.82665732e+03]
 [-3.55649084e+05 -1.45443689e+03]
 [-1.58547141e+05 -1.01830279e+04]
 [-1.98190747e+05 -5.17572811e+03]
 [-3.01704910e+05 -2.95811839e+03]
 [-1.54320501e+05 -7.39419463e+03]
 [-1.96829062e+05 -6.10573062e+02]
 [-3.19229505e+05 -1.55686562e+03]
 [-1.79594491e+05 -8.49849680e+03]
 [-3.16377509e+05  3.76594620e+02]
 [-2.26957859e+05 -6.35606972e+03]
 [-2.22280077e+05 -4.59182823e+03]
 [-1.52713150e+05 -1.09754045e+04]
 [ 1.40714288e+05  8.39621888e+03]
 [-2.63617815e+05 -5.91146582e+03]
 [-5.62960798e+05  2.09791414e+03]]
In [758]:
plt.subplot(1,2,1)
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.title("Projection of the original data onto the first 2 PCs")
plt.scatter(pca_x_uncar[:,0], pca_x_uncar[:,1],cmap='viridis')
Out[758]:
<matplotlib.collections.PathCollection at 0x7f97b28fcf70>
In [759]:
# Ordering the features by their coefficients in the equation of the first Principal Component
# PC1 = a*column1 + b*column2 + ... + zzz*column101. The coefficients of PC1 a, b, c, ..zzz also called loadings, are stored in pca_clean.components_[0]
pca_x_1 = pca_uncar.components_[0] # First principal component
sort_array_1 = np.argsort(np.abs(pca_x_1)) # Sorting the 'Indexes' of the pca_clean_1 array from the lowest value to the highest (in magnitude, ignoring the sign)
pca_x_1_highest_values = pca_x_1[sort_array_1][::-1][:10] # We sorted from low to high, [::-1] retrieves all values backward (to inverse the sort), [:10] get the first
pca_x_1_cols_highest_values = filtered_uncar.columns[4+sort_array_1][::-1][:10] # retrieving the names of the columns corresponding to the highest values
# The '1+sort_array' accounts for the fact that df has the 'communityname' column and sort_array sorted indices in an array that didn't have the column (we dropped it before doing PCA)

# Same thing but ... a little bit more compact
pca_x_2_highest_values = np.asarray(pca_uncar.components_[1])[np.argsort(np.abs(pca_uncar.components_[1]))][::-1][:10]
pca_x_2_cols_highest_values = filtered_uncar.columns[4+np.argsort(np.abs(pca_uncar.components_[1]))][::-1][:10]

# Printing the result
print('\nFirst Principal Component : 10 Highest Loadings (in magnitude)\n')
for i in range(5):
    print('The Column "{}" has a loading of : {}'.format(pca_x_1_cols_highest_values[i], pca_x_1_highest_values[i]))

print('\nSecond Principal Component : 10 Highest Loadings (in magnitude)\n')
for i in range(5):
    print('The Column "{}" has a loading of : {}'.format(pca_x_2_cols_highest_values[i], pca_x_2_highest_values[i]))
First Principal Component : 10 Highest Loadings (in magnitude)

The Column "Care (unpaid); provision of : All categories\ Provision of unpaid care - Unit : Persons" has a loading of : 0.7450075442779435
The Column "Care (unpaid); provision of : Provides no unpaid care - Unit : Persons" has a loading of : 0.6648239263505321
The Column "Care (unpaid); provision of : Provides 1 to 19 hours unpaid care a week - Unit : Persons" has a loading of : 0.04972547442500246
The Column "Care (unpaid); provision of : Provides 50 or more hours unpaid care a week - Unit : Persons" has a loading of : 0.019497201389133176
The Column "Care (unpaid); provision of : Provides 20 to 49 hours unpaid care a week - Unit : Persons" has a loading of : 0.010960942113275944

Second Principal Component : 10 Highest Loadings (in magnitude)

The Column "Care (unpaid); provision of : Provides 1 to 19 hours unpaid care a week - Unit : Persons" has a loading of : 0.5733948523975633
The Column "Care (unpaid); provision of : Provides no unpaid care - Unit : Persons" has a loading of : -0.5715387821651857
The Column "Care (unpaid); provision of : All categories\ Provision of unpaid care - Unit : Persons" has a loading of : 0.4610468646875205
The Column "Care (unpaid); provision of : Provides 50 or more hours unpaid care a week - Unit : Persons" has a loading of : 0.34485479907997496
The Column "Care (unpaid); provision of : Provides 20 to 49 hours unpaid care a week - Unit : Persons" has a loading of : 0.11433599537516563
In [760]:
# Let's extract the numeric columns and turn them into a numpy 2d array (matrix)
uncar_num = filtered_uncar.select_dtypes(include='number') # selects numeric columns only
uncar_num.head()
Out[760]:
Care (unpaid); provision of : Provides no unpaid care - Unit : Persons Care (unpaid); provision of : Provides 1 to 19 hours unpaid care a week - Unit : Persons Care (unpaid); provision of : Provides 20 to 49 hours unpaid care a week - Unit : Persons Care (unpaid); provision of : Provides 50 or more hours unpaid care a week - Unit : Persons Care (unpaid); provision of : All categories\ Provision of unpaid care - Unit : Persons
11 455769.0 35820.0 5268.0 8426.0 505283.0
12 561034.0 41313.0 6785.0 12078.0 621210.0
13 443363.0 35927.0 7265.0 13303.0 499858.0
16 363583.0 33362.0 5456.0 10504.0 412905.0
17 467262.0 39537.0 6745.0 13127.0 526671.0
In [761]:
# Let's compute the distance matrix of pairwise euclidean distances
dist_mat_uncar = cdist(uncar_num, uncar_num) # computes the distances between rows of first dataframe and rows of second dataframe. Here I gave the same dataFrame twice, to get a pairwise distance matrix
print("The shape of this matrix is", dist_mat_uncar.shape)
The shape of this matrix is (112, 112)
In [762]:
mds_uncar = MDS(n_components=2, random_state=123, dissimilarity='precomputed') # dissimilarity precomputed means that we provide the distance matrix, otherwise MDS() can compute it itself
mds_x_uncar = mds_uncar.fit_transform(dist_mat_uncar)
In [972]:
# Let's visualize the results
plt.figure(num=5, figsize=(25, 12))
plt.scatter(mds_x_uncar[:, 0], mds_x_uncar[:, 1], alpha = .3, s=100)

# We need to loop over every borough, get its name from the borough dataframe and its location from mds_x and add the text to our scatterplot
for index,name in enumerate(filtered_uncar['GEO_LABEL']):
    plt.annotate(s=name, xy=(mds_x_uncar[index, 0], mds_x_uncar[index, 1]))

plt.xlabel('Axis 1')
plt.ylabel('Axis 2')
plt.title('Multidimensional Scaling on Districts and subdivisions (All rows)');
In [ ]:
 

Economic Activity

Economic activity relates to whether or not a person who was aged 16 and over was working or looking for work in the week before census. Rather than a simple indicator of whether or not someone was currently in employment, it provides a measure of whether or not a person was an active participant in the labour market.
A person's economic activity is derived from their 'activity last week'. This is an indicator of their status or availability for employment - whether employed, unemployed, or their status if not employed and not seeking employment. Additional information included in the economic activity classification is also derived from information about the number of hours a person works and their type of employment - whether employed or self-employed.
The census concept of economic activity is compatible with the standard for economic status defined by the <a href=http://www.ilo.org/global/statistics-and-databases/classifications/lang--en/index.htm" target="_blank">International Labour Organisation (ILO)</a>. It is one of a number of definitions used internationally to produce accurate and comparable statistics on employment

In [764]:
df_ea=pd.read_csv("Economic activity dataset in England .csv")
df_ea = df_ea.iloc[1: , :]
In [765]:
df_ea.head()
Out[765]:
CDU_ID GEO_CODE GEO_LABEL GEO_TYPE GEO_TYP2 Economic activity of household reference persons : Total\ Economic activity of Household Reference Persons - Unit : Persons Economic activity of household reference persons : Economically active - Unit : Persons Economic activity of household reference persons : Economically active\ Employee\ Part-time - Unit : Persons Economic activity of household reference persons : Economically active\ Employee\ Full-time - Unit : Persons Economic activity of household reference persons : Economically active\ Self-employed with employees\ Part-time - Unit : Persons ... Economic activity of household reference persons : Economically active\ Self-employed without employees\ Part-time - Unit : Persons Economic activity of household reference persons : Economically active\ Self-employed without employees\ Full-time - Unit : Persons Economic activity of household reference persons : Economically active\ Unemployed - Unit : Persons Economic activity of household reference persons : Economically active\ Full-time student - Unit : Persons Economic activity of household reference persons : Economically inactive - Unit : Persons Economic activity of household reference persons : Economically inactive\ Retired - Unit : Persons Economic activity of household reference persons : Economically inactive\ Student (including full-time students) - Unit : Persons Economic activity of household reference persons : Economically inactive\ Looking after home or family - Unit : Persons Economic activity of household reference persons : Economically inactive\ Long-term sick or disabled - Unit : Persons Economic activity of household reference persons : Economically inactive\ Other - Unit : Persons
1 4.0 E92000001 England Countries and Groupings CTRY 22063368.0 14842348.0 1945412.0 9456015.0 75031.0 ... 465116.0 1437357.0 682994.0 218945.0 7221020.0 5501846.0 194600.0 400365.0 800557.0 323652.0
2 8.0 E12000001 North East Regions RGN 1129935.0 706095.0 106043.0 457168.0 2797.0 ... 14677.0 47367.0 45026.0 10818.0 423840.0 315447.0 10530.0 23808.0 56137.0 17918.0
3 9.0 E12000002 North West Regions RGN 3009549.0 1936875.0 282641.0 1230280.0 9503.0 ... 52663.0 159811.0 103457.0 29549.0 1072674.0 784206.0 26794.0 57942.0 155600.0 48132.0
4 10.0 E12000003 Yorkshire and The Humber Regions RGN 2224059.0 1465365.0 207771.0 929335.0 7205.0 ... 39473.0 123603.0 79698.0 25022.0 758694.0 580652.0 19691.0 39746.0 85629.0 32976.0
5 11.0 E12000004 East Midlands Regions RGN 1895604.0 1265971.0 167239.0 829970.0 5568.0 ... 33744.0 111499.0 54919.0 17630.0 629633.0 495276.0 16077.0 30543.0 63603.0 24134.0

5 rows × 21 columns

In [766]:
GeoCodes=["E09000002","E06000022","E06000055","E06000037","E09000004","E06000008","E06000028","E09000005","E06000043","E06000023","E09000006","E10000002","E10000003","E09000007","E06000049","E41000052","E09000008","E10000006","E06000005","E07000035","E06000015","E07000042","E10000009","E06000047","E09000009","E06000011","E10000011","E09000010","E10000012","E10000013","E09000011","E09000012","E06000006","E09000013","E10000014","E09000014","E09000015","E06000001","E09000016","E06000019","E10000015","E09000017","E09000018","E06000046","E09000019","E09000020","E10000016","E06000010","E09000021","E09000022","E10000017","E06000016","E10000018","E09000023","E10000019","E02000001","E06000032","E08000003","E06000035","E11000002","E09000024","E06000002","E06000042","E09000025","E10000020","E06000012","E06000013","E06000024","E10000023","E10000021","E06000048","E06000018","E10000024","E10000025","E06000031","E06000026","E06000029","E06000044","E09000026","E06000003","E09000027","E06000017","E06000051","E10000027","E06000025","E11000003","E06000045","E06000033","E09000028","E10000028","E06000021","E06000004","E10000029","E10000030","E09000029","E06000030","E06000020","E06000034","E06000027","E09000030","E11000004","E09000031","E09000032","E06000007","E10000031","E11000005","E10000032","E11000006","E41000324","E06000054","E10000034","E06000014"]
In [767]:
filtered_ea = df_ea[df_ea['GEO_CODE'].isin(GeoCodes)]
filtered_ea.head()
Out[767]:
CDU_ID GEO_CODE GEO_LABEL GEO_TYPE GEO_TYP2 Economic activity of household reference persons : Total\ Economic activity of Household Reference Persons - Unit : Persons Economic activity of household reference persons : Economically active - Unit : Persons Economic activity of household reference persons : Economically active\ Employee\ Part-time - Unit : Persons Economic activity of household reference persons : Economically active\ Employee\ Full-time - Unit : Persons Economic activity of household reference persons : Economically active\ Self-employed with employees\ Part-time - Unit : Persons ... Economic activity of household reference persons : Economically active\ Self-employed without employees\ Part-time - Unit : Persons Economic activity of household reference persons : Economically active\ Self-employed without employees\ Full-time - Unit : Persons Economic activity of household reference persons : Economically active\ Unemployed - Unit : Persons Economic activity of household reference persons : Economically active\ Full-time student - Unit : Persons Economic activity of household reference persons : Economically inactive - Unit : Persons Economic activity of household reference persons : Economically inactive\ Retired - Unit : Persons Economic activity of household reference persons : Economically inactive\ Student (including full-time students) - Unit : Persons Economic activity of household reference persons : Economically inactive\ Looking after home or family - Unit : Persons Economic activity of household reference persons : Economically inactive\ Long-term sick or disabled - Unit : Persons Economic activity of household reference persons : Economically inactive\ Other - Unit : Persons
11 17.0 E10000002 Buckinghamshire Counties CNTY 200727.0 144378.0 15009.0 94016.0 811.0 ... 5746.0 17263.0 3442.0 1133.0 56349.0 48310.0 691.0 2300.0 3262.0 1786.0
12 18.0 E10000003 Cambridgeshire Counties CNTY 251241.0 178693.0 18660.0 123735.0 725.0 ... 5192.0 17084.0 4742.0 1889.0 72548.0 59802.0 2034.0 2885.0 5565.0 2262.0
13 19.0 E10000006 Cumbria Counties CNTY 222042.0 141248.0 20349.0 88574.0 790.0 ... 4286.0 14627.0 4911.0 1022.0 80794.0 66710.0 648.0 2558.0 8429.0 2449.0
16 22.0 E10000009 Dorset Counties CNTY 180213.0 108380.0 15986.0 64329.0 672.0 ... 5068.0 13793.0 2766.0 488.0 71833.0 63692.0 364.0 1956.0 4150.0 1671.0
17 23.0 E10000011 East Sussex Counties CNTY 231905.0 143576.0 20818.0 81453.0 872.0 ... 7117.0 19946.0 5507.0 1168.0 88329.0 73832.0 925.0 3415.0 7437.0 2720.0

5 rows × 21 columns

In [973]:
filtered_ea.loc[filtered_ea.GEO_CODE == 'E06000037', 'GEO_LABEL'] = "Berkshire"
filtered_ea.loc[filtered_ea.GEO_CODE == 'E06000023', 'GEO_LABEL'] = "Bristol"
filtered_ea.loc[filtered_ea.GEO_CODE == 'E06000049', 'GEO_LABEL'] = "Cheshire"
filtered_ea.loc[filtered_ea.GEO_CODE == 'E41000052', 'GEO_LABEL'] = "Cornwall"
filtered_ea.loc[filtered_ea.GEO_CODE == 'E07000035', 'GEO_LABEL'] = "Derbyshire"
filtered_ea.loc[filtered_ea.GEO_CODE == 'E07000042', 'GEO_LABEL'] = "Devon"
filtered_ea.loc[filtered_ea.GEO_CODE == 'E06000047', 'GEO_LABEL'] = "Durham"
filtered_ea.loc[filtered_ea.GEO_CODE == 'E06000019', 'GEO_LABEL'] = "Herefordshire"
filtered_ea.loc[filtered_ea.GEO_CODE == 'E06000010', 'GEO_LABEL'] = "Kingston upon Hull"
filtered_ea.loc[filtered_ea.GEO_CODE == 'E02000001', 'GEO_LABEL'] = "City of London"
filtered_ea.loc[filtered_ea.GEO_CODE == 'E41000324', 'GEO_LABEL'] = "Westminster"
filtered_ea.loc[filtered_ea.GEO_CODE == 'E06000021', 'GEO_LABEL'] = "Stockton-on-Trent"
In [1023]:
filtered_ea = filtered_ea.drop(['GEO_TYP2'], axis=1)
In [1024]:
filtered_ea = filtered_ea.drop(['GEO_TYPE'], axis=1)
In [ ]:
 
In [768]:
filtered_ea = filtered_ea.drop(['CDU_ID'], axis=1)
In [769]:
# Transform the data into a numpy ndarray
np_array_ea = filtered_ea.values
In [770]:
np_array_ea
Out[770]:
array([['E10000002', 'Buckinghamshire', 'Counties', ..., 2300.0, 3262.0,
        1786.0],
       ['E10000003', 'Cambridgeshire', 'Counties', ..., 2885.0, 5565.0,
        2262.0],
       ['E10000006', 'Cumbria', 'Counties', ..., 2558.0, 8429.0, 2449.0],
       ...,
       ['E41000052', 'Cornwall, Isles of Scilly', 'Local Authorities',
        ..., 3349.0, 7877.0, 2629.0],
       ['E41000324', 'City of London, Westminster', 'Local Authorities',
        ..., 2448.0, 6132.0, 3317.0],
       ['E02000001', 'City of London 001',
        'Middle Super Output Areas and Intermediate Zones', ..., 19.0,
        70.0, 48.0]], dtype=object)
In [771]:
# Removing the first four columns
np_array_ea = np_array_ea[:, 4:]
In [772]:
# Building the PCA model
pca_ea = PCA(n_components=2).fit(np_array_ea) # We're telling PCA here to keep only the 2 first principal components when fitting the data
pca_x_ea = pca_ea.transform(np_array_ea) # This variable will contain the coordinates of each row in the new system of coordinates found by PCA (restricted to the first two directions or principal components)
# Those 2 Principal Components are going to be used to project the data onto a 2D space in order to be able 
# to visualize a "compressed" version (in terms of variance) of our initial multivariate DataFrame

# Let's check out the result of our PCA applied to the dataFrame df
print("The pca_x_ea variable has a shape of :", pca_x_ea.shape)
print(pca_x_ea)
The pca_x_ea variable has a shape of : (112, 2)
[[ 3.60498861e+04 -1.63302863e+04]
 [ 1.06319145e+05 -2.20570325e+04]
 [ 5.90549119e+04  1.46536181e+04]
 [ 9.96664536e+02  2.62289027e+04]
 [ 6.89679646e+04  2.51173967e+04]
 [ 5.47293641e+05 -6.67801164e+03]
 [ 1.07186651e+05 -6.70471748e+03]
 [ 5.06490061e+05 -2.90722520e+04]
 [ 3.82089532e+05 -4.44643993e+04]
 [ 5.78358504e+05 -5.54659568e+03]
 [ 4.28005931e+05  1.32922906e+04]
 [ 1.26280697e+05 -9.98186888e+03]
 [ 1.74510052e+05  1.45766373e+04]
 [ 2.59708191e+05  2.42689191e+04]
 [ 1.55649831e+05 -2.59809438e+04]
 [ 1.06948331e+05  6.74952421e+03]
 [ 2.13553070e+05  3.17453704e+03]
 [ 1.16419293e+05 -2.57020383e+04]
 [ 6.61014408e+04  1.13457073e+04]
 [ 2.42804486e+05 -8.14940069e+02]
 [ 1.81439176e+05  4.23225152e+03]
 [ 3.85355590e+05 -4.05389890e+04]
 [ 7.57243444e+04 -6.47393788e+03]
 [ 2.27891970e+05  6.10799260e+03]
 [ 8.59014919e+04 -4.95293797e+02]
 [ 5.57027276e+05  5.32628550e+04]
 [ 5.18314402e+05  1.69125920e+04]
 [ 4.08326441e+05  2.59013781e+04]
 [ 1.21142611e+06  1.94323345e+04]
 [ 1.00363517e+06 -2.19459242e+04]
 [-1.85335458e+05  9.27391289e+03]
 [-1.63648427e+05  1.09197156e+04]
 [-1.59871086e+05  1.29045766e+04]
 [-1.31982597e+05  4.97576855e+03]
 [-1.75883455e+05  5.97461052e+03]
 [-1.67144945e+05  5.76375349e+03]
 [-1.22438783e+05 -2.81752814e+02]
 [-1.62576232e+05  6.68063973e+03]
 [-8.84871158e+04  5.33145531e+03]
 [-4.58803189e+04  1.02066965e+04]
 [-1.45402314e+05  7.62380284e+03]
 [-1.42910973e+05  4.90750234e+03]
 [-1.25447635e+05  2.55268656e+03]
 [-1.00936116e+05  3.51847750e+03]
 [-7.46804384e+04  9.09738905e+02]
 [-2.18509447e+05  5.96850265e+03]
 [-7.23117740e+04  6.24897838e+03]
 [-1.33919384e+05  7.41103338e+03]
 [-1.48214400e+05  1.76369344e+03]
 [-9.53551621e+04  9.35776288e+03]
 [-1.40056929e+05  5.18855344e+03]
 [ 7.88289785e+03 -8.56618165e+03]
 [-1.19974971e+05  7.49093356e+03]
 [-9.09782742e+04 -4.74620917e+03]
 [-9.21384351e+04  6.24576655e+03]
 [-1.61301704e+05  1.45680394e+04]
 [-1.28671636e+05  5.56885217e+03]
 [-1.53210692e+05  7.54973325e+03]
 [-1.16758147e+05 -7.36148066e+03]
 [-1.37993009e+05 -4.55003917e+02]
 [-1.38717970e+05 -3.49522483e+02]
 [-1.38430914e+05  5.49648314e+03]
 [-1.53551874e+05 -1.26378118e+03]
 [-9.44377235e+04 -2.46649660e+03]
 [-1.52639812e+05 -3.71010059e+03]
 [-1.02276774e+05 -1.36783656e+04]
 [-7.56984617e+04 -4.30140092e+03]
 [-1.23695729e+05  1.14785931e+03]
 [-1.06429794e+05 -3.86290365e+02]
 [-1.58272497e+05  1.43609426e+04]
 [ 6.06640280e+04  1.80989692e+04]
 [-5.28494994e+04  1.20664370e+04]
 [-2.29108308e+04  3.81774053e+03]
 [-6.41755887e+04  6.04838643e+03]
 [ 2.62377800e+04 -7.48828232e+03]
 [-1.52043672e+05  1.25828839e+03]
 [-1.97623591e+05  7.25673184e+03]
 [-1.94822500e+05  5.95409374e+03]
 [ 3.26738183e+04 -2.04138873e+03]
 [-1.46274273e+05  3.91203109e+03]
 [-1.13575852e+05  2.58227801e+03]
 [-9.10145933e+04 -6.82633705e+03]
 [-6.13597835e+04 -1.53232868e+03]
 [-1.08301395e+05 -5.56519715e+03]
 [-4.21634627e+04 -1.14719567e+04]
 [-7.05516307e+04 -1.16861940e+04]
 [-7.93451035e+04  6.58254372e+02]
 [-1.02595376e+05 -5.19803157e+03]
 [-1.03153427e+05 -7.68167454e+03]
 [-1.29127432e+05 -9.28181069e+03]
 [-1.02214040e+05 -7.53565610e+03]
 [-1.24703317e+05 -2.33312155e+03]
 [-1.07976722e+05  6.17933637e+03]
 [-1.02564898e+05 -4.07545406e+03]
 [-1.09397487e+05 -8.99587569e+03]
 [-1.13208288e+05 -6.76335388e+03]
 [-1.33297614e+05 -5.09099052e+03]
 [-1.51720170e+05 -3.95860140e+03]
 [-6.09464607e+04 -2.30544139e+04]
 [-8.15939036e+04 -1.15569300e+04]
 [-1.30496804e+05 -8.54440795e+03]
 [-1.03513504e+05 -6.34781684e+03]
 [-1.05271878e+05 -3.91001412e+03]
 [-1.29000149e+05 -8.15382082e+03]
 [-7.50678786e+04 -1.67895771e+04]
 [-1.31898490e+05 -3.55542936e+03]
 [-1.02082472e+05 -1.21936483e+04]
 [-1.08418370e+05 -5.46660984e+03]
 [-5.76722276e+04 -2.84717248e+04]
 [ 6.76486325e+04  2.57407571e+04]
 [-9.10404921e+04 -7.92419202e+03]
 [-2.32792828e+05  5.10794408e+03]]
In [773]:
plt.subplot(1,2,1)
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.title("Projection of the original data onto the first 2 PCs")
plt.scatter(pca_x_ea[:,0], pca_x_ea[:,1],cmap='viridis')
Out[773]:
<matplotlib.collections.PathCollection at 0x7f97a28accd0>
In [775]:
# Ordering the features by their coefficients in the equation of the first Principal Component
# PC1 = a*column1 + b*column2 + ... + zzz*column101. The coefficients of PC1 a, b, c, ..zzz also called loadings, are stored in pca_clean.components_[0]
pca_x_1 = pca_ea.components_[0] # First principal component
sort_array_1 = np.argsort(np.abs(pca_x_1)) # Sorting the 'Indexes' of the pca_clean_1 array from the lowest value to the highest (in magnitude, ignoring the sign)
pca_x_1_highest_values = pca_x_1[sort_array_1][::-1][:10] # We sorted from low to high, [::-1] retrieves all values backward (to inverse the sort), [:10] get the first
pca_x_1_cols_highest_values = filtered_ea.columns[4+sort_array_1][::-1][:10] # retrieving the names of the columns corresponding to the highest values
# The '1+sort_array' accounts for the fact that df has the 'communityname' column and sort_array sorted indices in an array that didn't have the column (we dropped it before doing PCA)

# Same thing but ... a little bit more compact
pca_x_2_highest_values = np.asarray(pca_ea.components_[1])[np.argsort(np.abs(pca_ea.components_[1]))][::-1][:10]
pca_x_2_cols_highest_values = filtered_ea.columns[4+np.argsort(np.abs(pca_ea.components_[1]))][::-1][:10]

# Printing the result
print('\nFirst Principal Component : 10 Highest Loadings (in magnitude)\n')
for i in range(5):
    print('The Column "{}" has a loading of : {}'.format(pca_x_1_cols_highest_values[i], pca_x_1_highest_values[i]))

print('\nSecond Principal Component : 10 Highest Loadings (in magnitude)\n')
for i in range(5):
    print('The Column "{}" has a loading of : {}'.format(pca_x_2_cols_highest_values[i], pca_x_2_highest_values[i]))
First Principal Component : 10 Highest Loadings (in magnitude)

The Column "Economic activity of household reference persons : Total\ Economic activity of Household Reference Persons - Unit : Persons" has a loading of : 0.7438596995146264
The Column "Economic activity of household reference persons : Economically active - Unit : Persons" has a loading of : 0.48608511343248484
The Column "Economic activity of household reference persons : Economically active\ Employee\ Full-time - Unit : Persons" has a loading of : 0.3100336529102649
The Column "Economic activity of household reference persons : Economically inactive - Unit : Persons" has a loading of : 0.2577745860821408
The Column "Economic activity of household reference persons : Economically inactive\ Retired - Unit : Persons" has a loading of : 0.1977437163526562

Second Principal Component : 10 Highest Loadings (in magnitude)

The Column "Economic activity of household reference persons : Economically inactive - Unit : Persons" has a loading of : 0.5991419120448395
The Column "Economic activity of household reference persons : Economically active - Unit : Persons" has a loading of : -0.44777801804565043
The Column "Economic activity of household reference persons : Economically active\ Employee\ Full-time - Unit : Persons" has a loading of : -0.4477695005457169
The Column "Economic activity of household reference persons : Economically inactive\ Retired - Unit : Persons" has a loading of : 0.41572417318083993
The Column "Economic activity of household reference persons : Total\ Economic activity of Household Reference Persons - Unit : Persons" has a loading of : 0.15136389399918865
In [776]:
# Let's extract the numeric columns and turn them into a numpy 2d array (matrix)
ea_num = filtered_ea.select_dtypes(include='number') # selects numeric columns only
ea_num.head()
Out[776]:
Economic activity of household reference persons : Total\ Economic activity of Household Reference Persons - Unit : Persons Economic activity of household reference persons : Economically active - Unit : Persons Economic activity of household reference persons : Economically active\ Employee\ Part-time - Unit : Persons Economic activity of household reference persons : Economically active\ Employee\ Full-time - Unit : Persons Economic activity of household reference persons : Economically active\ Self-employed with employees\ Part-time - Unit : Persons Economic activity of household reference persons : Economically active\ Self-employed with employees\ Full-time - Unit : Persons Economic activity of household reference persons : Economically active\ Self-employed without employees\ Part-time - Unit : Persons Economic activity of household reference persons : Economically active\ Self-employed without employees\ Full-time - Unit : Persons Economic activity of household reference persons : Economically active\ Unemployed - Unit : Persons Economic activity of household reference persons : Economically active\ Full-time student - Unit : Persons Economic activity of household reference persons : Economically inactive - Unit : Persons Economic activity of household reference persons : Economically inactive\ Retired - Unit : Persons Economic activity of household reference persons : Economically inactive\ Student (including full-time students) - Unit : Persons Economic activity of household reference persons : Economically inactive\ Looking after home or family - Unit : Persons Economic activity of household reference persons : Economically inactive\ Long-term sick or disabled - Unit : Persons Economic activity of household reference persons : Economically inactive\ Other - Unit : Persons
11 200727.0 144378.0 15009.0 94016.0 811.0 6958.0 5746.0 17263.0 3442.0 1133.0 56349.0 48310.0 691.0 2300.0 3262.0 1786.0
12 251241.0 178693.0 18660.0 123735.0 725.0 6666.0 5192.0 17084.0 4742.0 1889.0 72548.0 59802.0 2034.0 2885.0 5565.0 2262.0
13 222042.0 141248.0 20349.0 88574.0 790.0 6689.0 4286.0 14627.0 4911.0 1022.0 80794.0 66710.0 648.0 2558.0 8429.0 2449.0
16 180213.0 108380.0 15986.0 64329.0 672.0 5278.0 5068.0 13793.0 2766.0 488.0 71833.0 63692.0 364.0 1956.0 4150.0 1671.0
17 231905.0 143576.0 20818.0 81453.0 872.0 6695.0 7117.0 19946.0 5507.0 1168.0 88329.0 73832.0 925.0 3415.0 7437.0 2720.0
In [777]:
# Let's compute the distance matrix of pairwise euclidean distances
dist_mat_ea = cdist(ea_num, ea_num) # computes the distances between rows of first dataframe and rows of second dataframe. Here I gave the same dataFrame twice, to get a pairwise distance matrix
print("The shape of this matrix is", dist_mat_ea.shape)
The shape of this matrix is (112, 112)
In [778]:
mds_ea = MDS(n_components=2, random_state=123, dissimilarity='precomputed') # dissimilarity precomputed means that we provide the distance matrix, otherwise MDS() can compute it itself
mds_x_ea = mds_ea.fit_transform(dist_mat_ea)
In [974]:
# Let's visualize the results
plt.figure(num=5, figsize=(25, 12))
plt.scatter(mds_x_ea[:, 0], mds_x_ea[:, 1], alpha = .3, s=100)

# We need to loop over every borough, get its name from the borough dataframe and its location from mds_x and add the text to our scatterplot
for index,name in enumerate(filtered_ea['GEO_LABEL']):
    plt.annotate(s=name, xy=(mds_x_ea[index, 0], mds_x_ea[index, 1]))

plt.xlabel('Axis 1')
plt.ylabel('Axis 2')
plt.title('Multidimensional Scaling on Districts and subdivisions (All rows)');

Data info on England population

The main population base for statistics from the 2011 Census is the usual resident population as at census day, 27 March 2011. Although the population base for enumeration included non-UK born short-term residents, this population is analysed separately and is not included in the main outputs from the 2011 Census.
All statistics, unless specified, are produced using only usual residents of the UK.

For 2011 Census purposes, a usual resident of the UK is anyone who, on census day, was in the UK and had stayed or intended to stay in the UK for a period of 12 months or more, or had a permanent UK address and was outside the UK and intended to be outside the UK for less than 12 months.

For information about the main population base for statistics, how other population sub-groups are counted, and all variable definitions, see information about <a href=http://web.ons.gov.uk/ons/guide-method/census/2011/census-data/2011-census-user-guide/information-by-variable/index.html" target="_blank">variables and classifications</a>."

In [851]:
df_pop = pd.read_csv("Data on population in England.csv")
df_pop = df_pop.iloc[1: , :]
In [852]:
df_pop
Out[852]:
CDU_ID GEO_CODE GEO_LABEL GEO_TYPE GEO_TYP2 Population (usual residents) : Schoolchild or full-time student aged 4 and over at their non-term-time address - Unit : Persons Population (usual residents) : Density (number of persons per Hectare) - Unit : Persons Population (usual residents) : All usual residents - Unit : Persons Population (usual residents) : Area (Hectares) - Unit : Hectares Population (usual residents) : All households - Unit : Households Population (usual residents) : All usual residents in households - Unit : Persons Population (usual residents) : All usual residents in communal establishments - Unit : Persons
1 4.0 E92000001 England Countries and Groupings CTRY 650145.0 4.069166 53012456.0 13000000.00 22063368.0 52059931.0 952525.0
2 8.0 E12000001 North East Regions RGN 22618.0 3.029085 2596886.0 857316.90 1129935.0 2550818.0 46068.0
3 9.0 E12000002 North West Regions RGN 76565.0 4.999649 7052177.0 1410535.00 3009549.0 6927820.0 124357.0
4 10.0 E12000003 Yorkshire and The Humber Regions RGN 54976.0 3.429295 5283733.0 1540764.00 2224059.0 5185677.0 98056.0
5 11.0 E12000004 East Midlands Regions RGN 54416.0 2.904717 4533222.0 1560642.00 1895604.0 4442192.0 91030.0
... ... ... ... ... ... ... ... ... ... ... ... ...
219050 231883.0 E00176770 E00176770 Output Areas and Small Areas OASA 0.0 57.074340 952.0 16.68 57.0 156.0 796.0
219051 231884.0 E00176771 E00176771 Output Areas and Small Areas OASA 22.0 59.199240 621.0 10.49 44.0 124.0 497.0
219052 231885.0 E00176772 E00176772 Output Areas and Small Areas OASA 7.0 24.591740 256.0 10.41 98.0 252.0 4.0
219053 231886.0 E00176773 E00176773 Output Areas and Small Areas OASA 0.0 20.435310 169.0 8.27 107.0 169.0 0.0
219054 231887.0 E00176774 E00176774 Output Areas and Small Areas OASA 0.0 9.874153 102.0 10.33 53.0 102.0 0.0

219054 rows × 12 columns

In [853]:
GeoCodes=["E09000002","E06000022","E06000055","E06000037","E09000004","E06000008","E06000028","E09000005","E06000043","E06000023","E09000006","E10000002","E10000003","E09000007","E06000049","E41000052","E09000008","E10000006","E06000005","E07000035","E06000015","E07000042","E10000009","E06000047","E09000009","E06000011","E10000011","E09000010","E10000012","E10000013","E09000011","E09000012","E06000006","E09000013","E10000014","E09000014","E09000015","E06000001","E09000016","E06000019","E10000015","E09000017","E09000018","E06000046","E09000019","E09000020","E10000016","E06000010","E09000021","E09000022","E10000017","E06000016","E10000018","E09000023","E10000019","E02000001","E06000032","E08000003","E06000035","E11000002","E09000024","E06000002","E06000042","E09000025","E10000020","E06000012","E06000013","E06000024","E10000023","E10000021","E06000048","E06000018","E10000024","E10000025","E06000031","E06000026","E06000029","E06000044","E09000026","E06000003","E09000027","E06000017","E06000051","E10000027","E06000025","E11000003","E06000045","E06000033","E09000028","E10000028","E06000021","E06000004","E10000029","E10000030","E09000029","E06000030","E06000020","E06000034","E06000027","E09000030","E11000004","E09000031","E09000032","E06000007","E10000031","E11000005","E10000032","E11000006","E41000324","E06000054","E10000034","E06000014"]
In [854]:
filtered_pop = df_pop[df_pop['GEO_CODE'].isin(GeoCodes)]
filtered_pop.head()
Out[854]:
CDU_ID GEO_CODE GEO_LABEL GEO_TYPE GEO_TYP2 Population (usual residents) : Schoolchild or full-time student aged 4 and over at their non-term-time address - Unit : Persons Population (usual residents) : Density (number of persons per Hectare) - Unit : Persons Population (usual residents) : All usual residents - Unit : Persons Population (usual residents) : Area (Hectares) - Unit : Hectares Population (usual residents) : All households - Unit : Households Population (usual residents) : All usual residents in households - Unit : Persons Population (usual residents) : All usual residents in communal establishments - Unit : Persons
11 17.0 E10000002 Buckinghamshire Counties CNTY 10771.0 3.228751 505283.0 156494.9 200727.0 497299.0 7984.0
12 18.0 E10000003 Cambridgeshire Counties CNTY 9352.0 2.039267 621210.0 304624.2 251241.0 598066.0 23144.0
13 19.0 E10000006 Cumbria Counties CNTY 6584.0 0.738718 499858.0 676655.9 222042.0 490939.0 8919.0
16 22.0 E10000009 Dorset Counties CNTY 6281.0 1.624573 412905.0 254162.2 180213.0 403366.0 9539.0
17 23.0 E10000011 East Sussex Counties CNTY 7111.0 3.082554 526671.0 170855.4 231905.0 515598.0 11073.0

Check for NaN values which is causing an error for our PCA calculation

In [855]:
filtered_pop.isnull().values.any()
Out[855]:
True
In [856]:
filtered_pop.isnull().sum().sum()
Out[856]:
2
In [857]:
count_nan_in_df = filtered_pop.isnull().sum()
print (count_nan_in_df)
CDU_ID                                                                                                                             0
GEO_CODE                                                                                                                           0
GEO_LABEL                                                                                                                          0
GEO_TYPE                                                                                                                           0
GEO_TYP2                                                                                                                           0
Population (usual residents) : Schoolchild or full-time student aged 4 and over at their non-term-time address - Unit : Persons    0
Population (usual residents) : Density (number of persons per Hectare) - Unit : Persons                                            2
Population (usual residents) : All usual residents - Unit : Persons                                                                0
Population (usual residents) : Area (Hectares) - Unit : Hectares                                                                   0
Population (usual residents) : All households - Unit : Households                                                                  0
Population (usual residents) : All usual residents in households - Unit : Persons                                                  0
Population (usual residents) : All usual residents in communal establishments - Unit : Persons                                     0
dtype: int64
In [858]:
df133 = filtered_pop[filtered_pop.isna().any(axis=1)]
In [859]:
#The two rows in the dataframes which have NaN values
df133
Out[859]:
CDU_ID GEO_CODE GEO_LABEL GEO_TYPE GEO_TYP2 Population (usual residents) : Schoolchild or full-time student aged 4 and over at their non-term-time address - Unit : Persons Population (usual residents) : Density (number of persons per Hectare) - Unit : Persons Population (usual residents) : All usual residents - Unit : Persons Population (usual residents) : Area (Hectares) - Unit : Hectares Population (usual residents) : All households - Unit : Households Population (usual residents) : All usual residents in households - Unit : Persons Population (usual residents) : All usual residents in communal establishments - Unit : Persons
368 400.0 E41000052 Cornwall, Isles of Scilly Local Authorities LA 6185.0 NaN 534476.0 356256.0 231378.0 525555.0 8921.0
369 401.0 E41000324 City of London, Westminster Local Authorities LA 3249.0 NaN 226771.0 2438.0 110157.0 220395.0 6376.0

We can fix this by calculating density (Population/Area)

In [860]:
Population_Density_Persons_Cornwall = (534476.0/356256.0)
Population_Density_Persons_Westminster = (226771.0/2438.0)
print(Population_Density_Persons_Cornwall)
print(Population_Density_Persons_Westminster)
1.5002582412647085
93.01517637407711

We know that:

City of London, Westminster index = 110
                &
    Cornwall, Isles of Scilly = 109

Therefore we use the iloc function to add:

  • Population_Density_Persons_Cornwall
  • Population_Density_Persons_Westminster
In [862]:
filtered_pop.update(filtered_pop.iloc[[109]].fillna('1.5002582412647085'))
filtered_pop.update(filtered_pop.iloc[[110]].fillna('93.01517637407711'))
In [863]:
filtered_pop.iloc[109]
Out[863]:
CDU_ID                                                                                                                                                   400
GEO_CODE                                                                                                                                           E41000052
GEO_LABEL                                                                                                                          Cornwall, Isles of Scilly
GEO_TYPE                                                                                                                                   Local Authorities
GEO_TYP2                                                                                                                                                  LA
Population (usual residents) : Schoolchild or full-time student aged 4 and over at their non-term-time address - Unit : Persons                         6185
Population (usual residents) : Density (number of persons per Hectare) - Unit : Persons                                                   1.5002582412647085
Population (usual residents) : All usual residents - Unit : Persons                                                                                   534476
Population (usual residents) : Area (Hectares) - Unit : Hectares                                                                                      356256
Population (usual residents) : All households - Unit : Households                                                                                     231378
Population (usual residents) : All usual residents in households - Unit : Persons                                                                     525555
Population (usual residents) : All usual residents in communal establishments - Unit : Persons                                                          8921
Name: 368, dtype: object

Done! Now lets continue

In [864]:
filtered_pop
Out[864]:
CDU_ID GEO_CODE GEO_LABEL GEO_TYPE GEO_TYP2 Population (usual residents) : Schoolchild or full-time student aged 4 and over at their non-term-time address - Unit : Persons Population (usual residents) : Density (number of persons per Hectare) - Unit : Persons Population (usual residents) : All usual residents - Unit : Persons Population (usual residents) : Area (Hectares) - Unit : Hectares Population (usual residents) : All households - Unit : Households Population (usual residents) : All usual residents in households - Unit : Persons Population (usual residents) : All usual residents in communal establishments - Unit : Persons
11 17.0 E10000002 Buckinghamshire Counties CNTY 10771.0 3.22875 505283.0 156494.90 200727.0 497299.0 7984.0
12 18.0 E10000003 Cambridgeshire Counties CNTY 9352.0 2.03927 621210.0 304624.20 251241.0 598066.0 23144.0
13 19.0 E10000006 Cumbria Counties CNTY 6584.0 0.738718 499858.0 676655.90 222042.0 490939.0 8919.0
16 22.0 E10000009 Dorset Counties CNTY 6281.0 1.62457 412905.0 254162.20 180213.0 403366.0 9539.0
17 23.0 E10000011 East Sussex Counties CNTY 7111.0 3.08255 526671.0 170855.40 231905.0 515598.0 11073.0
... ... ... ... ... ... ... ... ... ... ... ... ...
366 398.0 E09000031 Waltham Forest Local Authorities LA 2149.0 66.535 258249.0 3881.40 96861.0 256615.0 1634.0
367 399.0 E09000032 Wandsworth Local Authorities LA 4236.0 89.5998 306995.0 3426.29 130493.0 301648.0 5347.0
368 400.0 E41000052 Cornwall, Isles of Scilly Local Authorities LA 6185.0 1.5002582412647085 534476.0 356256.00 231378.0 525555.0 8921.0
369 401.0 E41000324 City of London, Westminster Local Authorities LA 3249.0 93.01517637407711 226771.0 2438.00 110157.0 220395.0 6376.0
8048 9937.0 E02000001 City of London 001 Middle Super Output Areas and Intermediate Zones MSOAIZ 78.0 25.4503 7375.0 289.78 4385.0 7187.0 188.0

112 rows × 12 columns

In [975]:
filtered_pop.loc[filtered_pop.GEO_CODE == 'E06000037', 'GEO_LABEL'] = "Berkshire"
filtered_pop.loc[filtered_pop.GEO_CODE == 'E06000023', 'GEO_LABEL'] = "Bristol"
filtered_pop.loc[filtered_pop.GEO_CODE == 'E06000049', 'GEO_LABEL'] = "Cheshire"
filtered_pop.loc[filtered_pop.GEO_CODE == 'E41000052', 'GEO_LABEL'] = "Cornwall"
filtered_pop.loc[filtered_pop.GEO_CODE == 'E07000035', 'GEO_LABEL'] = "Derbyshire"
filtered_pop.loc[filtered_pop.GEO_CODE == 'E07000042', 'GEO_LABEL'] = "Devon"
filtered_pop.loc[filtered_pop.GEO_CODE == 'E06000047', 'GEO_LABEL'] = "Durham"
filtered_pop.loc[filtered_pop.GEO_CODE == 'E06000019', 'GEO_LABEL'] = "Herefordshire"
filtered_pop.loc[filtered_pop.GEO_CODE == 'E06000010', 'GEO_LABEL'] = "Kingston upon Hull"
filtered_pop.loc[filtered_pop.GEO_CODE == 'E02000001', 'GEO_LABEL'] = "City of London"
filtered_pop.loc[filtered_pop.GEO_CODE == 'E41000324', 'GEO_LABEL'] = "Westminster"
filtered_pop.loc[filtered_pop.GEO_CODE == 'E06000021', 'GEO_LABEL'] = "Stockton-on-Trent"
In [1025]:
filtered_pop = filtered_pop.drop(['GEO_TYP2'], axis=1)
In [1026]:
filtered_pop = filtered_pop.drop(['GEO_TYPE'], axis=1)
In [1027]:
filtered_pop.head()
Out[1027]:
GEO_CODE GEO_LABEL Population (usual residents) : Schoolchild or full-time student aged 4 and over at their non-term-time address - Unit : Persons Population (usual residents) : Density (number of persons per Hectare) - Unit : Persons Population (usual residents) : All usual residents - Unit : Persons Population (usual residents) : Area (Hectares) - Unit : Hectares Population (usual residents) : All households - Unit : Households Population (usual residents) : All usual residents in households - Unit : Persons Population (usual residents) : All usual residents in communal establishments - Unit : Persons
11 E10000002 Buckinghamshire 10771.0 3.22875 505283.0 156494.9 200727.0 497299.0 7984.0
12 E10000003 Cambridgeshire 9352.0 2.03927 621210.0 304624.2 251241.0 598066.0 23144.0
13 E10000006 Cumbria 6584.0 0.738718 499858.0 676655.9 222042.0 490939.0 8919.0
16 E10000009 Dorset 6281.0 1.62457 412905.0 254162.2 180213.0 403366.0 9539.0
17 E10000011 East Sussex 7111.0 3.08255 526671.0 170855.4 231905.0 515598.0 11073.0
In [865]:
filtered_pop = filtered_pop.drop(['CDU_ID'], axis=1)
In [866]:
# Transform the data into a numpy ndarray
np_array_pop = filtered_pop.values
In [867]:
np_array_pop
Out[867]:
array([['E10000002', 'Buckinghamshire', 'Counties', ..., 200727.0,
        497299.0, 7984.0],
       ['E10000003', 'Cambridgeshire', 'Counties', ..., 251241.0,
        598066.0, 23144.0],
       ['E10000006', 'Cumbria', 'Counties', ..., 222042.0, 490939.0,
        8919.0],
       ...,
       ['E41000052', 'Cornwall, Isles of Scilly', 'Local Authorities',
        ..., 231378.0, 525555.0, 8921.0],
       ['E41000324', 'City of London, Westminster', 'Local Authorities',
        ..., 110157.0, 220395.0, 6376.0],
       ['E02000001', 'City of London 001',
        'Middle Super Output Areas and Intermediate Zones', ..., 4385.0,
        7187.0, 188.0]], dtype=object)
In [868]:
# Removing the first four columns
np_array_pop = np_array_pop[:, 4:]
In [869]:
# Building the PCA model
pca_pop = PCA(n_components=2).fit(np_array_pop) # We're telling PCA here to keep only the 2 first principal components when fitting the data
pca_x_pop = pca_pop.transform(np_array_pop) # This variable will contain the coordinates of each row in the new system of coordinates found by PCA (restricted to the first two directions or principal components)
# Those 2 Principal Components are going to be used to project the data onto a 2D space in order to be able 
# to visualize a "compressed" version (in terms of variance) of our initial multivariate DataFrame

# Let's check out the result of our PCA applied to the dataFrame df
print("The pca_x_pop variable has a shape of :", pca_x_pop.shape)
print(pca_x_pop)
The pca_x_pop variable has a shape of : (112, 2)
[[ 1.19571123e+05  3.59784782e+04]
 [ 2.97223125e+05  1.64490049e+05]
 [ 1.78185664e+05  5.53198267e+05]
 [-3.08412287e+02  1.48784818e+05]
 [ 1.56756448e+05  4.66293198e+04]
 [ 1.43864303e+06  7.27554414e+04]
 [ 2.68940175e+05  1.28395382e+05]
 [ 1.32660757e+06  1.07459573e+05]
 [ 1.00996592e+06 -6.05733830e+04]
 [ 1.53437794e+06  6.95166176e+04]
 [ 1.10224801e+06  5.60709508e+04]
 [ 3.35843480e+05  6.29544093e+04]
 [ 4.76275121e+05  4.33065006e+05]
 [ 6.81181240e+05  3.53800894e+05]
 [ 4.03365307e+05  8.33294990e+04]
 [ 3.30950213e+05  6.63162996e+05]
 [ 5.38719398e+05  3.95372508e+04]
 [ 3.33844658e+05  1.15743534e+05]
 [ 1.78955530e+05  2.19235876e+05]
 [ 6.32624380e+05  8.23551498e+04]
 [ 4.72750509e+05  2.19993279e+05]
 [ 1.02497922e+06 -6.02977258e+04]
 [ 1.86974843e+05  6.96543398e+04]
 [ 5.66677993e+05  2.69006597e+04]
 [ 2.13360754e+05  4.29771061e+04]
 [ 1.38962355e+06 -2.04564117e+05]
 [ 1.34162240e+06 -1.08239361e+05]
 [ 9.88280815e+05 -1.68203867e+05]
 [ 3.34334942e+06 -4.09681729e+05]
 [ 2.62613919e+06 -2.10916241e+05]
 [-4.94716911e+05 -3.98509422e+04]
 [-4.28853451e+05 -5.16478621e+04]
 [-4.29651006e+05 -3.22524692e+04]
 [-3.50185781e+05 -4.57582380e+04]
 [-4.73859935e+05 -3.18458905e+04]
 [-4.45807599e+05 -4.71068323e+04]
 [-3.34305164e+05 -4.99280445e+04]
 [-4.15222870e+05 -4.49743958e+04]
 [-2.55062987e+05 -7.01070791e+04]
 [-1.16857747e+05  1.48834177e+05]
 [-3.94890245e+05 -4.16641511e+04]
 [-3.76275552e+05  2.19661478e+04]
 [-3.42105938e+05 -3.97560942e+04]
 [-2.68496115e+05 -6.80270808e+04]
 [-1.56200375e+05 -8.19280913e+04]
 [-5.73566279e+05 -1.67259754e+03]
 [-1.92892927e+05 -7.70634141e+04]
 [-3.37853303e+05  1.51714974e+05]
 [-3.85390056e+05 -3.30058432e+04]
 [-2.65706603e+05 -6.67005930e+04]
 [-3.72683475e+05 -2.87814222e+04]
 [-7.21253873e+03 -9.52960425e+04]
 [-3.30694045e+05 -3.08421820e+04]
 [-2.43538653e+05 -2.87973592e+04]
 [-2.56980468e+05 -6.90896450e+04]
 [-4.38800630e+05 -4.94423756e+04]
 [-3.62519254e+05 -6.00194369e+04]
 [-4.14695001e+05 -5.21468583e+04]
 [-3.22118414e+05 -4.64083719e+04]
 [-3.59541084e+05 -3.06959822e+04]
 [-3.36495860e+05 -6.37903741e+04]
 [-3.76545428e+05 -5.89375547e+04]
 [-3.99076848e+05 -4.42149724e+04]
 [-2.45498213e+05 -5.93126794e+04]
 [-3.99514418e+05  1.03513164e+04]
 [-2.65420626e+05 -4.52933803e+04]
 [-2.31954684e+05 -7.15826387e+04]
 [-3.33094229e+05 -6.41875270e+04]
 [-2.88761726e+05 -6.83961084e+04]
 [-4.25662691e+05 -1.90488427e+04]
 [ 1.41167881e+05  1.00500435e+05]
 [-1.11057766e+05  4.10424736e+05]
 [-7.68509619e+04  1.92115320e+04]
 [-1.49664806e+05  2.32134740e+05]
 [ 8.79004758e+04  2.10090029e+05]
 [-3.96831670e+05 -1.28924179e+04]
 [-5.17871674e+05  3.31697730e+04]
 [-5.06529005e+05  4.39350728e+04]
 [ 9.16741296e+04 -1.06322904e+05]
 [-3.60750169e+05 -6.16246506e+04]
 [-2.91995816e+05 -6.71367652e+04]
 [-1.81316205e+05 -8.22206406e+04]
 [-1.76771138e+05 -7.14686159e+04]
 [-3.11226512e+05 -6.84416328e+04]
 [-1.01325852e+05 -8.69197336e+04]
 [-1.41527772e+05 -8.55700318e+04]
 [-1.76538011e+05 -7.87736366e+04]
 [-2.61664743e+05 -7.19983733e+04]
 [-2.71190087e+05 -7.36709815e+04]
 [-3.63060687e+05 -6.30206184e+04]
 [-2.59232097e+05 -7.40950345e+04]
 [-2.85581488e+05 -6.91525209e+04]
 [-2.83357241e+05 -6.28726575e+04]
 [-2.36394928e+05 -6.82624791e+04]
 [-2.62409797e+05 -7.12201302e+04]
 [-3.30326960e+05 -6.68830142e+04]
 [-3.96371166e+05 -5.93324021e+04]
 [-3.99013935e+05 -5.68553994e+04]
 [-1.87355250e+05 -8.25965764e+04]
 [-2.27522976e+05 -7.71142037e+04]
 [-3.39914946e+05 -6.38230825e+04]
 [-1.87992080e+05 -8.23320653e+04]
 [-2.27294350e+05 -7.54497905e+04]
 [-3.57415933e+05 -5.95950499e+04]
 [-2.11761931e+05 -7.95684839e+04]
 [-3.52852394e+05 -6.15961823e+04]
 [-2.62022388e+05 -7.47406058e+04]
 [-2.56046421e+05 -7.37013789e+04]
 [-1.83409423e+05 -8.22965441e+04]
 [ 1.90119612e+05  2.29189208e+05]
 [-2.97985477e+05 -6.95148344e+04]
 [-6.19447539e+05 -3.43978011e+04]]
In [184]:
plt.subplot(1,2,1)
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.title("Projection of the original data onto the first 2 PCs")
plt.scatter(pca_x_pop[:,0], pca_x_pop[:,1],cmap='viridis')
Out[184]:
<matplotlib.collections.PathCollection at 0x7f97b5ccbca0>
In [185]:
# Ordering the features by their coefficients in the equation of the first Principal Component
# PC1 = a*column1 + b*column2 + ... + zzz*column101. The coefficients of PC1 a, b, c, ..zzz also called loadings, are stored in pca_clean.components_[0]
pca_x_1 = pca_pop.components_[0] # First principal component
sort_array_1 = np.argsort(np.abs(pca_x_1)) # Sorting the 'Indexes' of the pca_clean_1 array from the lowest value to the highest (in magnitude, ignoring the sign)
pca_x_1_highest_values = pca_x_1[sort_array_1][::-1][:10] # We sorted from low to high, [::-1] retrieves all values backward (to inverse the sort), [:10] get the first
pca_x_1_cols_highest_values = filtered_pop.columns[4+sort_array_1][::-1][:10] # retrieving the names of the columns corresponding to the highest values
# The '1+sort_array' accounts for the fact that df has the 'communityname' column and sort_array sorted indices in an array that didn't have the column (we dropped it before doing PCA)

# Same thing but ... a little bit more compact
pca_x_2_highest_values = np.asarray(pca_pop.components_[1])[np.argsort(np.abs(pca_pop.components_[1]))][::-1][:10]
pca_x_2_cols_highest_values = filtered_pop.columns[4+np.argsort(np.abs(pca_pop.components_[1]))][::-1][:10]

# Printing the result
print('\nFirst Principal Component : 10 Highest Loadings (in magnitude)\n')
for i in range(7):
    print('The Column "{}" has a loading of : {}'.format(pca_x_1_cols_highest_values[i], pca_x_1_highest_values[i]))

print('\nSecond Principal Component : 10 Highest Loadings (in magnitude)\n')
for i in range(7):
    print('The Column "{}" has a loading of : {}'.format(pca_x_2_cols_highest_values[i], pca_x_2_highest_values[i]))
First Principal Component : 10 Highest Loadings (in magnitude)

The Column "Population (usual residents) : All usual residents - Unit : Persons" has a loading of : 0.6807788931030756
The Column "Population (usual residents) : All usual residents in households - Unit : Persons" has a loading of : 0.6687903891351726
The Column "Population (usual residents) : All households - Unit : Households" has a loading of : 0.28100845009978864
The Column "Population (usual residents) : Area (Hectares) - Unit : Hectares" has a loading of : 0.10049545626184857
The Column "Population (usual residents) : All usual residents in communal establishments - Unit : Persons" has a loading of : 0.01198850396790283
The Column "Population (usual residents) : Schoolchild or full-time student aged 4 and over at their non-term-time address - Unit : Persons" has a loading of : 0.007120677054010499
The Column "Population (usual residents) : Density (number of persons per Hectare) - Unit : Persons" has a loading of : -8.939654139139904e-06

Second Principal Component : 10 Highest Loadings (in magnitude)

The Column "Population (usual residents) : Area (Hectares) - Unit : Hectares" has a loading of : 0.9945868427926498
The Column "Population (usual residents) : All usual residents in households - Unit : Persons" has a loading of : -0.07560015117506051
The Column "Population (usual residents) : All usual residents - Unit : Persons" has a loading of : -0.0707709333231844
The Column "Population (usual residents) : Schoolchild or full-time student aged 4 and over at their non-term-time address - Unit : Persons" has a loading of : 0.005304367156740468
The Column "Population (usual residents) : All usual residents in communal establishments - Unit : Persons" has a loading of : 0.004829217851876525
The Column "Population (usual residents) : All households - Unit : Households" has a loading of : -0.004651508040496467
The Column "Population (usual residents) : Density (number of persons per Hectare) - Unit : Persons" has a loading of : -0.00010049665489290676
In [870]:
# Let's extract the numeric columns and turn them into a numpy 2d array (matrix)
pop_num = filtered_pop.select_dtypes(include='number') # selects numeric columns only
pop_num.head()
Out[870]:
Population (usual residents) : Schoolchild or full-time student aged 4 and over at their non-term-time address - Unit : Persons Population (usual residents) : All usual residents - Unit : Persons Population (usual residents) : Area (Hectares) - Unit : Hectares Population (usual residents) : All households - Unit : Households Population (usual residents) : All usual residents in households - Unit : Persons Population (usual residents) : All usual residents in communal establishments - Unit : Persons
11 10771.0 505283.0 156494.9 200727.0 497299.0 7984.0
12 9352.0 621210.0 304624.2 251241.0 598066.0 23144.0
13 6584.0 499858.0 676655.9 222042.0 490939.0 8919.0
16 6281.0 412905.0 254162.2 180213.0 403366.0 9539.0
17 7111.0 526671.0 170855.4 231905.0 515598.0 11073.0
In [871]:
# Let's compute the distance matrix of pairwise euclidean distances
dist_mat_pop = cdist(pop_num, pop_num) # computes the distances between rows of first dataframe and rows of second dataframe. Here I gave the same dataFrame twice, to get a pairwise distance matrix
print("The shape of this matrix is", dist_mat_pop.shape)
The shape of this matrix is (112, 112)
In [872]:
mds_pop = MDS(n_components=2, random_state=123, dissimilarity='precomputed') # dissimilarity precomputed means that we provide the distance matrix, otherwise MDS() can compute it itself
mds_x_pop = mds_pop.fit_transform(dist_mat_pop)
In [976]:
# Let's visualize the results
plt.figure(num=5, figsize=(25, 12))
plt.scatter(mds_x_pop[:, 0], mds_x_pop[:, 1], alpha = .3, s=100)

# We need to loop over every borough, get its name from the borough dataframe and its location from mds_x and add the text to our scatterplot
for index,name in enumerate(filtered_pop['GEO_LABEL']):
    plt.annotate(s=name, xy=(mds_x_pop[index, 0], mds_x_pop[index, 1]))

plt.xlabel('Axis 1')
plt.ylabel('Axis 2')
plt.title('Multidimensional Scaling on Districts and subdivisions (All rows)');

Ethnicity Data on United Kingdom

In [874]:
df_ethnic=pd.read_csv("Data on ethnicity in England Geo.csv")
df_ethnic = df_ethnic.iloc[1: , :]
In [875]:
df_ethnic
Out[875]:
CDU_ID GEO_CODE GEO_LABEL GEO_TYPE GEO_TYP2 Ethnic group [E][S][W] : Total\ Ethnic group - Unit : Persons Ethnic group [E][S][W] : White\ English/Welsh/Scottish/Northern Irish/British - Unit : Persons Ethnic group [E][S][W] : White\ Irish - Unit : Persons Ethnic group [E][S][W] : White\ Gypsy or Irish Traveller - Unit : Persons Ethnic group [E][S][W] : White\ Other White - Unit : Persons ... Ethnic group [E][S][W] : Asian/Asian British\ Indian - Unit : Persons Ethnic group [E][S][W] : Asian/Asian British\ Pakistani - Unit : Persons Ethnic group [E][S][W] : Asian/Asian British\ Bangladeshi - Unit : Persons Ethnic group [E][S][W] : Asian/Asian British\ Chinese - Unit : Persons Ethnic group [E][S][W] : Asian/Asian British\ Other Asian - Unit : Persons Ethnic group [E][S][W] : Black/African/Caribbean/Black British\ African - Unit : Persons Ethnic group [E][S][W] : Black/African/Caribbean/Black British\ Caribbean - Unit : Persons Ethnic group [E][S][W] : Black/African/Caribbean/Black British\ Other Black - Unit : Persons Ethnic group [E][S][W] : Other ethnic group\ Arab - Unit : Persons Ethnic group [E][S][W] : Other ethnic group\ Any other ethnic group - Unit : Persons
1 4.0 E92000001 England Countries and Groupings CTRY 53012456.0 42279236.0 517001.0 54895.0 2430010.0 ... 1395702.0 1112282.0 436514.0 379503.0 819402.0 977741.0 591016.0 277857.0 220985.0 327433.0
2 8.0 E12000001 North East Regions RGN 2596886.0 2431423.0 8035.0 1684.0 34425.0 ... 15817.0 19831.0 10972.0 14284.0 13695.0 10982.0 1193.0 1045.0 5850.0 5201.0
3 9.0 E12000002 North West Regions RGN 7052177.0 6141069.0 64930.0 4147.0 151570.0 ... 107353.0 189436.0 45897.0 48049.0 46750.0 59278.0 23131.0 15460.0 24528.0 19688.0
4 10.0 E12000003 Yorkshire and The Humber Regions RGN 5283733.0 4531137.0 26410.0 4378.0 130031.0 ... 69252.0 225892.0 22424.0 28435.0 39961.0 46033.0 23420.0 10892.0 21340.0 19570.0
5 11.0 E12000004 East Midlands Regions RGN 4533222.0 3871146.0 28676.0 3418.0 143116.0 ... 168928.0 48940.0 13258.0 24404.0 37893.0 41768.0 28913.0 10803.0 9746.0 15989.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
219050 231883.0 E00176770 E00176770 Output Areas and Small Areas OASA 952.0 852.0 8.0 6.0 12.0 ... 7.0 10.0 8.0 1.0 0.0 6.0 6.0 7.0 2.0 1.0
219051 231884.0 E00176771 E00176771 Output Areas and Small Areas OASA 621.0 536.0 55.0 1.0 1.0 ... 2.0 0.0 0.0 9.0 1.0 5.0 3.0 1.0 0.0 1.0
219052 231885.0 E00176772 E00176772 Output Areas and Small Areas OASA 256.0 235.0 1.0 0.0 3.0 ... 9.0 0.0 0.0 3.0 1.0 0.0 0.0 1.0 0.0 0.0
219053 231886.0 E00176773 E00176773 Output Areas and Small Areas OASA 169.0 141.0 3.0 0.0 15.0 ... 1.0 0.0 0.0 1.0 2.0 5.0 0.0 0.0 0.0 0.0
219054 231887.0 E00176774 E00176774 Output Areas and Small Areas OASA 102.0 96.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 3.0 0.0 3.0 0.0 0.0 0.0 0.0

219054 rows × 24 columns

In [876]:
GeoCodes=["E09000002","E06000022","E06000055","E06000037","E09000004","E06000008","E06000028","E09000005","E06000043","E06000023","E09000006","E10000002","E10000003","E09000007","E06000049","E41000052","E09000008","E10000006","E06000005","E07000035","E06000015","E07000042","E10000009","E06000047","E09000009","E06000011","E10000011","E09000010","E10000012","E10000013","E09000011","E09000012","E06000006","E09000013","E10000014","E09000014","E09000015","E06000001","E09000016","E06000019","E10000015","E09000017","E09000018","E06000046","E09000019","E09000020","E10000016","E06000010","E09000021","E09000022","E10000017","E06000016","E10000018","E09000023","E10000019","E02000001","E06000032","E08000003","E06000035","E11000002","E09000024","E06000002","E06000042","E09000025","E10000020","E06000012","E06000013","E06000024","E10000023","E10000021","E06000048","E06000018","E10000024","E10000025","E06000031","E06000026","E06000029","E06000044","E09000026","E06000003","E09000027","E06000017","E06000051","E10000027","E06000025","E11000003","E06000045","E06000033","E09000028","E10000028","E06000021","E06000004","E10000029","E10000030","E09000029","E06000030","E06000020","E06000034","E06000027","E09000030","E11000004","E09000031","E09000032","E06000007","E10000031","E11000005","E10000032","E11000006","E41000324","E06000054","E10000034","E06000014"]
In [877]:
filtered_ethnic = df_ethnic[df_ethnic['GEO_CODE'].isin(GeoCodes)]
print(filtered_ethnic.head())
    CDU_ID   GEO_CODE        GEO_LABEL  GEO_TYPE GEO_TYP2  \
11    17.0  E10000002  Buckinghamshire  Counties     CNTY   
12    18.0  E10000003   Cambridgeshire  Counties     CNTY   
13    19.0  E10000006          Cumbria  Counties     CNTY   
16    22.0  E10000009           Dorset  Counties     CNTY   
17    23.0  E10000011      East Sussex  Counties     CNTY   

    Ethnic group [E][S][W] : Total\ Ethnic group - Unit : Persons  \
11                                           505283.0               
12                                           621210.0               
13                                           499858.0               
16                                           412905.0               
17                                           526671.0               

    Ethnic group [E][S][W] : White\ English/Welsh/Scottish/Northern Irish/British - Unit : Persons  \
11                                           409793.0                                                
12                                           524617.0                                                
13                                           482124.0                                                
16                                           394350.0                                                
17                                           482769.0                                                

    Ethnic group [E][S][W] : White\ Irish - Unit : Persons  \
11                                             5377.0        
12                                             4908.0        
13                                             1552.0        
16                                             1975.0        
17                                             3966.0        

    Ethnic group [E][S][W] : White\ Gypsy or Irish Traveller - Unit : Persons  \
11                                              614.0                           
12                                             1508.0                           
13                                              315.0                           
16                                              555.0                           
17                                              815.0                           

    Ethnic group [E][S][W] : White\ Other White - Unit : Persons  ...  \
11                                            20886.0             ...   
12                                            43954.0             ...   
13                                             8266.0             ...   
16                                             7437.0             ...   
17                                            17872.0             ...   

    Ethnic group [E][S][W] : Asian/Asian British\ Indian - Unit : Persons  \
11                                            11368.0                       
12                                             7430.0                       
13                                              892.0                       
16                                              737.0                       
17                                             2253.0                       

    Ethnic group [E][S][W] : Asian/Asian British\ Pakistani - Unit : Persons  \
11                                            21236.0                          
12                                             2373.0                          
13                                              316.0                          
16                                              151.0                          
17                                              317.0                          

    Ethnic group [E][S][W] : Asian/Asian British\ Bangladeshi - Unit : Persons  \
11                                             1089.0                            
12                                             2562.0                            
13                                              486.0                            
16                                              525.0                            
17                                             1042.0                            

    Ethnic group [E][S][W] : Asian/Asian British\ Chinese - Unit : Persons  \
11                                             2554.0                        
12                                             6723.0                        
13                                             1153.0                        
16                                              943.0                        
17                                             1931.0                        

    Ethnic group [E][S][W] : Asian/Asian British\ Other Asian - Unit : Persons  \
11                                             7022.0                            
12                                             6550.0                            
13                                             1219.0                            
16                                             1477.0                            
17                                             3600.0                            

    Ethnic group [E][S][W] : Black/African/Caribbean/Black British\ African - Unit : Persons  \
11                                             4032.0                                          
12                                             3426.0                                          
13                                              373.0                                          
16                                              518.0                                          
17                                             1800.0                                          

    Ethnic group [E][S][W] : Black/African/Caribbean/Black British\ Caribbean - Unit : Persons  \
11                                             5175.0                                            
12                                             1647.0                                            
13                                              141.0                                            
16                                              295.0                                            
17                                              771.0                                            

    Ethnic group [E][S][W] : Black/African/Caribbean/Black British\ Other Black - Unit : Persons  \
11                                             1283.0                                              
12                                              937.0                                              
13                                               65.0                                              
16                                              111.0                                              
17                                              341.0                                              

    Ethnic group [E][S][W] : Other ethnic group\ Arab - Unit : Persons  \
11                                              853.0                    
12                                             1370.0                    
13                                              153.0                    
16                                              116.0                    
17                                              638.0                    

    Ethnic group [E][S][W] : Other ethnic group\ Any other ethnic group - Unit : Persons  
11                                             1641.0                                     
12                                             2124.0                                     
13                                              299.0                                     
16                                              315.0                                     
17                                             1083.0                                     

[5 rows x 24 columns]
In [878]:
filtered_ethnic.columns
Out[878]:
Index(['CDU_ID', 'GEO_CODE', 'GEO_LABEL', 'GEO_TYPE', 'GEO_TYP2',
       'Ethnic group [E][S][W] : Total\ Ethnic group - Unit : Persons',
       'Ethnic group [E][S][W] : White\ English/Welsh/Scottish/Northern Irish/British - Unit : Persons',
       'Ethnic group [E][S][W] : White\ Irish - Unit : Persons',
       'Ethnic group [E][S][W] : White\ Gypsy or Irish Traveller - Unit : Persons',
       'Ethnic group [E][S][W] : White\ Other White - Unit : Persons',
       'Ethnic group [E][S][W] : Mixed/multiple ethnic group\ White and Black Caribbean - Unit : Persons',
       'Ethnic group [E][S][W] : Mixed/multiple ethnic group\ White and Black African - Unit : Persons',
       'Ethnic group [E][S][W] : Mixed/multiple ethnic group\ White and Asian - Unit : Persons',
       'Ethnic group [E][S][W] : Mixed/multiple ethnic group\ Other Mixed - Unit : Persons',
       'Ethnic group [E][S][W] : Asian/Asian British\ Indian - Unit : Persons',
       'Ethnic group [E][S][W] : Asian/Asian British\ Pakistani - Unit : Persons',
       'Ethnic group [E][S][W] : Asian/Asian British\ Bangladeshi - Unit : Persons',
       'Ethnic group [E][S][W] : Asian/Asian British\ Chinese - Unit : Persons',
       'Ethnic group [E][S][W] : Asian/Asian British\ Other Asian - Unit : Persons',
       'Ethnic group [E][S][W] : Black/African/Caribbean/Black British\ African - Unit : Persons',
       'Ethnic group [E][S][W] : Black/African/Caribbean/Black British\ Caribbean - Unit : Persons',
       'Ethnic group [E][S][W] : Black/African/Caribbean/Black British\ Other Black - Unit : Persons',
       'Ethnic group [E][S][W] : Other ethnic group\ Arab - Unit : Persons',
       'Ethnic group [E][S][W] : Other ethnic group\ Any other ethnic group - Unit : Persons'],
      dtype='object')
In [ ]:
 
In [880]:
filtered_ethnic.rename(columns = {'Ethnic group [E][S][W] : Total\ Ethnic group - Unit : Persons':'Total\ Ethnic group - Unit : Persons',
                                'Ethnic group [E][S][W] : White\ English/Welsh/Scottish/Northern Irish/British - Unit : Persons':'White\ English/Welsh/Scottish/Northern Irish/British - Unit : Persons',
                                'Ethnic group [E][S][W] : White\ Irish - Unit : Persons':'White\ Irish - Unit : Persons',
                                'Ethnic group [E][S][W] : White\ Gypsy or Irish Traveller - Unit : Persons':'White\ Gypsy or Irish Traveller - Unit : Persons',
                                'Ethnic group [E][S][W] : White\ Other White - Unit : Persons':'White\ Other White - Unit : Persons',
                                'Ethnic group [E][S][W] : Mixed/multiple ethnic group\ White and Black Caribbean - Unit : Persons':'Mixed/multiple ethnic group\ White and Black Caribbean - Unit : Persons',
                                'Ethnic group [E][S][W] : Mixed/multiple ethnic group\ White and Black African - Unit : Persons':'Mixed/multiple ethnic group\ White and Black African - Unit : Persons',
                                'Ethnic group [E][S][W] : Mixed/multiple ethnic group\ White and Asian - Unit : Persons':'Mixed/multiple ethnic group\ White and Asian - Unit : Persons',
                                'Ethnic group [E][S][W] : Mixed/multiple ethnic group\ Other Mixed - Unit : Persons':'Mixed/multiple ethnic group\ Other Mixed - Unit : Persons',
                                'Ethnic group [E][S][W] : Asian/Asian British\ Indian - Unit : Persons':'Asian/Asian British\ Indian - Unit : Persons',
                                'Ethnic group [E][S][W] : Asian/Asian British\ Pakistani - Unit : Persons':'Asian/Asian British\ Pakistani - Unit : Persons',
                                'Ethnic group [E][S][W] : Asian/Asian British\ Bangladeshi - Unit : Persons':'Asian/Asian British\ Bangladeshi - Unit : Persons',
                                'Ethnic group [E][S][W] : Asian/Asian British\ Chinese - Unit : Persons':'Asian/Asian British\ Chinese - Unit : Persons',
                                'Ethnic group [E][S][W] : Asian/Asian British\ Other Asian - Unit : Persons':'Asian/Asian British\ Other Asian - Unit : Persons',
                                'Ethnic group [E][S][W] : Black/African/Caribbean/Black British\ African - Unit : Persons':'Black/African/Caribbean/Black British\ African - Unit : Persons',
                                'Ethnic group [E][S][W] : Black/African/Caribbean/Black British\ Caribbean - Unit : Persons':'Black/African/Caribbean/Black British\ Caribbean - Unit : Persons',
                                'Ethnic group [E][S][W] : Black/African/Caribbean/Black British\ Other Black - Unit : Persons':'Black/African/Caribbean/Black British\ Other Black - Unit : Persons',
                                'Ethnic group [E][S][W] : Other ethnic group\ Arab - Unit : Persons':'Number of people classified as Arab ethnic group',
                                'Ethnic group [E][S][W] : Other ethnic group\ Any other ethnic group - Unit : Persons':'Other ethnic group\ Any other ethnic group - Unit : Persons'}, inplace = True)
In [881]:
filtered_ethnic
Out[881]:
CDU_ID GEO_CODE GEO_LABEL GEO_TYPE GEO_TYP2 Total\ Ethnic group - Unit : Persons White\ English/Welsh/Scottish/Northern Irish/British - Unit : Persons White\ Irish - Unit : Persons White\ Gypsy or Irish Traveller - Unit : Persons White\ Other White - Unit : Persons ... Asian/Asian British\ Indian - Unit : Persons Asian/Asian British\ Pakistani - Unit : Persons Asian/Asian British\ Bangladeshi - Unit : Persons Asian/Asian British\ Chinese - Unit : Persons Asian/Asian British\ Other Asian - Unit : Persons Black/African/Caribbean/Black British\ African - Unit : Persons Black/African/Caribbean/Black British\ Caribbean - Unit : Persons Black/African/Caribbean/Black British\ Other Black - Unit : Persons Number of people classified as Arab ethnic group Other ethnic group\ Any other ethnic group - Unit : Persons
11 17.0 E10000002 Buckinghamshire Counties CNTY 505283.0 409793.0 5377.0 614.0 20886.0 ... 11368.0 21236.0 1089.0 2554.0 7022.0 4032.0 5175.0 1283.0 853.0 1641.0
12 18.0 E10000003 Cambridgeshire Counties CNTY 621210.0 524617.0 4908.0 1508.0 43954.0 ... 7430.0 2373.0 2562.0 6723.0 6550.0 3426.0 1647.0 937.0 1370.0 2124.0
13 19.0 E10000006 Cumbria Counties CNTY 499858.0 482124.0 1552.0 315.0 8266.0 ... 892.0 316.0 486.0 1153.0 1219.0 373.0 141.0 65.0 153.0 299.0
16 22.0 E10000009 Dorset Counties CNTY 412905.0 394350.0 1975.0 555.0 7437.0 ... 737.0 151.0 525.0 943.0 1477.0 518.0 295.0 111.0 116.0 315.0
17 23.0 E10000011 East Sussex Counties CNTY 526671.0 482769.0 3966.0 815.0 17872.0 ... 2253.0 317.0 1042.0 1931.0 3600.0 1800.0 771.0 341.0 638.0 1083.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
366 398.0 E09000031 Waltham Forest Local Authorities LA 258249.0 92999.0 3959.0 369.0 37472.0 ... 9134.0 26347.0 4632.0 2579.0 11697.0 18815.0 18841.0 7135.0 3776.0 6728.0
367 399.0 E09000032 Wandsworth Local Authorities LA 306995.0 163739.0 7664.0 163.0 47650.0 ... 8642.0 9718.0 1493.0 3715.0 9770.0 14818.0 12297.0 5641.0 2350.0 4094.0
368 400.0 E41000052 Cornwall, Isles of Scilly Local Authorities LA 534476.0 511715.0 2056.0 635.0 10619.0 ... 837.0 107.0 280.0 1004.0 1208.0 293.0 369.0 102.0 189.0 644.0
369 401.0 E41000324 City of London, Westminster Local Authorities LA 226771.0 81577.0 5140.0 79.0 54333.0 ... 7429.0 2344.0 6531.0 6180.0 10318.0 9239.0 4495.0 2931.0 15793.0 8698.0
8048 9937.0 E02000001 City of London 001 Middle Super Output Areas and Intermediate Zones MSOAIZ 7375.0 4243.0 180.0 3.0 1373.0 ... 216.0 16.0 232.0 263.0 213.0 98.0 46.0 49.0 69.0 85.0

112 rows × 24 columns

In [977]:
filtered_ethnic.loc[filtered_ethnic.GEO_CODE == 'E06000037', 'GEO_LABEL'] = "Berkshire"
filtered_ethnic.loc[filtered_ethnic.GEO_CODE == 'E06000023', 'GEO_LABEL'] = "Bristol"
filtered_ethnic.loc[filtered_ethnic.GEO_CODE == 'E06000049', 'GEO_LABEL'] = "Cheshire"
filtered_ethnic.loc[filtered_ethnic.GEO_CODE == 'E41000052', 'GEO_LABEL'] = "Cornwall"
filtered_ethnic.loc[filtered_ethnic.GEO_CODE == 'E07000035', 'GEO_LABEL'] = "Derbyshire"
filtered_ethnic.loc[filtered_ethnic.GEO_CODE == 'E07000042', 'GEO_LABEL'] = "Devon"
filtered_ethnic.loc[filtered_ethnic.GEO_CODE == 'E06000047', 'GEO_LABEL'] = "Durham"
filtered_ethnic.loc[filtered_ethnic.GEO_CODE == 'E06000019', 'GEO_LABEL'] = "Herefordshire"
filtered_ethnic.loc[filtered_ethnic.GEO_CODE == 'E06000010', 'GEO_LABEL'] = "Kingston upon Hull"
filtered_ethnic.loc[filtered_ethnic.GEO_CODE == 'E02000001', 'GEO_LABEL'] = "City of London"
filtered_ethnic.loc[filtered_ethnic.GEO_CODE == 'E41000324', 'GEO_LABEL'] = "Westminster"
filtered_ethnic.loc[filtered_ethnic.GEO_CODE == 'E06000021', 'GEO_LABEL'] = "Stockton-on-Trent"
In [1028]:
filtered_ethnic = filtered_ethnic.drop(['GEO_TYPE'], axis=1)
In [1029]:
filtered_ethnic = filtered_ethnic.drop(['GEO_TYP2'], axis=1)
In [ ]:
 
In [882]:
filtered_ethnic = filtered_ethnic.drop(['CDU_ID'], axis=1)
In [883]:
# Transform the data into a numpy ndarray
np_array_ethnic = filtered_ethnic.values
In [884]:
np_array_ethnic
Out[884]:
array([['E10000002', 'Buckinghamshire', 'Counties', ..., 1283.0, 853.0,
        1641.0],
       ['E10000003', 'Cambridgeshire', 'Counties', ..., 937.0, 1370.0,
        2124.0],
       ['E10000006', 'Cumbria', 'Counties', ..., 65.0, 153.0, 299.0],
       ...,
       ['E41000052', 'Cornwall, Isles of Scilly', 'Local Authorities',
        ..., 102.0, 189.0, 644.0],
       ['E41000324', 'City of London, Westminster', 'Local Authorities',
        ..., 2931.0, 15793.0, 8698.0],
       ['E02000001', 'City of London 001',
        'Middle Super Output Areas and Intermediate Zones', ..., 49.0,
        69.0, 85.0]], dtype=object)
In [ ]:
 
In [885]:
# Removing the first four columns
np_array_ethnic = np_array_ethnic[:, 4:]
In [886]:
# Building the PCA model
pca_ethnic = PCA(n_components=2).fit(np_array_ethnic) # We're telling PCA here to keep only the 2 first principal components when fitting the data
pca_x_ethnic = pca_ethnic.transform(np_array_ethnic) # This variable will contain the coordinates of each row in the new system of coordinates found by PCA (restricted to the first two directions or principal components)
# Those 2 Principal Components are going to be used to project the data onto a 2D space in order to be able 
# to visualize a "compressed" version (in terms of variance) of our initial multivariate DataFrame

# Let's check out the result of our PCA applied to the dataFrame df
print("The pca_x_ethnic variable has a shape of :", pca_x_ethnic.shape)
print(pca_x_ethnic)
The pca_x_ethnic variable has a shape of : (112, 2)
[[ 1.07613774e+05 -2.76221004e+03]
 [ 2.69833836e+05 -1.95225486e+04]
 [ 1.49018182e+05 -6.78029195e+04]
 [ 2.60304249e+04 -5.61053793e+04]
 [ 1.70112079e+05 -5.04757121e+04]
 [ 1.33692174e+06 -9.37066022e+04]
 [ 2.65045141e+05 -5.37167081e+04]
 [ 1.24313468e+06 -1.00385394e+05]
 [ 8.92076997e+05  1.36131352e+04]
 [ 1.41575298e+06 -7.67571384e+04]
 [ 1.02968140e+06 -6.56153215e+04]
 [ 3.26775027e+05 -4.03410062e+04]
 [ 4.29684716e+05 -6.91038199e+04]
 [ 6.23281810e+05 -7.60034265e+04]
 [ 3.67748887e+05 -2.59220990e+04]
 [ 2.77841950e+05 -6.79774831e+04]
 [ 5.26225986e+05 -7.17593663e+04]
 [ 3.09175195e+05 -1.46632574e+04]
 [ 1.84639896e+05 -6.31288847e+04]
 [ 6.16576569e+05 -8.20831692e+04]
 [ 4.39357930e+05 -5.79567069e+04]
 [ 9.32480242e+05 -1.17259425e+04]
 [ 1.84664634e+05 -3.56436711e+04]
 [ 5.35950920e+05 -4.87308020e+04]
 [ 2.26258740e+05 -5.52819739e+04]
 [ 1.32935673e+06 -1.06733102e+05]
 [ 1.24678077e+06 -5.92866475e+04]
 [ 9.52439425e+05 -8.40929079e+04]
 [ 2.72216397e+06  4.15324849e+05]
 [ 2.28958838e+06  1.07067417e+05]
 [-4.15989631e+05 -2.69729223e+04]
 [-3.60866996e+05 -1.92068543e+04]
 [-3.55322470e+05 -3.24693738e+04]
 [-2.81781557e+05 -3.18073020e+04]
 [-3.99197869e+05 -2.58796208e+04]
 [-3.69428410e+05 -3.02730430e+04]
 [-2.67876950e+05 -3.20328250e+04]
 [-3.66808476e+05  7.65517691e+03]
 [-1.99388441e+05 -2.94257635e+04]
 [-8.11971007e+04 -5.03094151e+04]
 [-3.23531389e+05 -3.22791065e+04]
 [-3.16008425e+05 -2.86407886e+04]
 [-2.77082209e+05 -2.75943868e+04]
 [-2.32043190e+05  2.32825978e+03]
 [-1.93679949e+05  9.77468678e+04]
 [-4.92342448e+05 -2.07401282e+04]
 [-1.80188336e+05  2.96952225e+04]
 [-2.92569005e+05 -3.18972029e+04]
 [-3.20053084e+05 -2.44876424e+04]
 [-2.14278102e+05 -2.13666343e+04]
 [-3.06801452e+05 -2.64468736e+04]
 [-8.83105097e+02  3.83645621e+03]
 [-2.65995006e+05 -3.40915803e+04]
 [-1.87028431e+05 -3.43344963e+04]
 [-1.94106684e+05 -3.62273291e+04]
 [-3.63517283e+05 -2.90672707e+04]
 [-3.04122838e+05 -1.73501767e+04]
 [-3.43292930e+05 -2.70382831e+04]
 [-2.69457952e+05 -1.79606310e+04]
 [-3.18697453e+05  3.70222664e+03]
 [-3.28667590e+05  5.07689544e+04]
 [-3.13395133e+05 -2.18440003e+04]
 [-3.40736544e+05 -1.35025460e+04]
 [-1.96271649e+05 -2.01623061e+04]
 [-3.36385939e+05 -2.54564888e+04]
 [-2.34489149e+05  3.45778086e+03]
 [-1.92619688e+05 -1.04470675e+04]
 [-2.75721278e+05 -1.79347221e+04]
 [-2.43610816e+05 -4.83272087e+03]
 [-3.53466923e+05 -2.97410379e+04]
 [ 1.67954891e+05 -6.97145788e+04]
 [-1.04257258e+05 -5.11471973e+04]
 [-3.75577864e+04 -4.60577778e+04]
 [-1.21474298e+05 -4.58193236e+04]
 [ 9.97290978e+04 -5.33823902e+04]
 [-3.50309821e+05 -6.11205252e+02]
 [-4.44915467e+05 -2.49638926e+04]
 [-4.36154034e+05 -2.50774499e+04]
 [ 3.49772065e+04  8.53096649e+04]
 [-3.41614202e+05  3.56206536e+04]
 [-2.50542872e+05 -4.53586169e+03]
 [-2.67441306e+05  1.55056363e+05]
 [-1.52474821e+05 -1.23148937e+03]
 [-3.11930967e+05  5.26454579e+04]
 [-1.53907421e+05  9.23879536e+04]
 [-2.16371600e+05  1.34371274e+05]
 [-2.22084453e+05  9.24547157e+04]
 [-2.62609960e+05  4.76790573e+04]
 [-2.97105404e+05  7.69918407e+04]
 [-3.50630496e+05  3.93277459e+04]
 [-2.90676746e+05  8.36029721e+04]
 [-3.11613007e+05  9.24550604e+04]
 [-2.34760481e+05 -1.59074334e+04]
 [-2.40942923e+05  5.69847358e+04]
 [-2.85781806e+05  8.35609199e+04]
 [-3.22066318e+05  4.17045612e+04]
 [-3.81496809e+05  3.92077706e+04]
 [-3.55741647e+05  9.65147742e+03]
 [-2.34713436e+05  9.16371737e+04]
 [-2.58225868e+05  7.58518197e+04]
 [-3.27860462e+05  4.09078084e+04]
 [-2.72661876e+05  1.56625230e+05]
 [-2.66306139e+05  1.01725508e+05]
 [-3.14210538e+05  1.54648597e+03]
 [-2.48605637e+05  8.43862907e+04]
 [-3.11018520e+05  2.78704368e+03]
 [-2.97491291e+05  8.81330834e+04]
 [-2.84736422e+05  8.54572338e+04]
 [-2.02308588e+05  5.95255894e+04]
 [ 1.94559194e+05 -6.83944065e+04]
 [-3.16713746e+05  6.92100208e+04]
 [-5.35217097e+05 -1.60541108e+04]]
In [887]:
plt.subplot(1,2,1)
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.title("Projection of the original data onto the first 2 PCs")
plt.scatter(pca_x_ethnic[:,0], pca_x_ethnic[:,1],cmap='viridis')
Out[887]:
<matplotlib.collections.PathCollection at 0x7f97b3d79d00>
In [888]:
# Ordering the features by their coefficients in the equation of the first Principal Component
# PC1 = a*column1 + b*column2 + ... + zzz*column101. The coefficients of PC1 a, b, c, ..zzz also called loadings, are stored in pca_clean.components_[0]
pca_x_1 = pca_ethnic.components_[0] # First principal component
sort_array_1 = np.argsort(np.abs(pca_x_1)) # Sorting the 'Indexes' of the pca_clean_1 array from the lowest value to the highest (in magnitude, ignoring the sign)
pca_x_1_highest_values = pca_x_1[sort_array_1][::-1][:10] # We sorted from low to high, [::-1] retrieves all values backward (to inverse the sort), [:10] get the first
pca_x_1_cols_highest_values = filtered_ethnic.columns[4+sort_array_1][::-1][:10] # retrieving the names of the columns corresponding to the highest values
# The '1+sort_array' accounts for the fact that df has the 'communityname' column and sort_array sorted indices in an array that didn't have the column (we dropped it before doing PCA)

# Same thing but ... a little bit more compact
pca_x_2_highest_values = np.asarray(pca_ethnic.components_[1])[np.argsort(np.abs(pca_ethnic.components_[1]))][::-1][:10]
pca_x_2_cols_highest_values = filtered_ethnic.columns[4+np.argsort(np.abs(pca_ethnic.components_[1]))][::-1][:10]

# Printing the result
print('\nFirst Principal Component : 10 Highest Loadings (in magnitude)\n')
for i in range(10):
    print('The Column "{}" has a loading of : {}'.format(pca_x_1_cols_highest_values[i], pca_x_1_highest_values[i]))

print('\nSecond Principal Component : 10 Highest Loadings (in magnitude)\n')
for i in range(10):
    print('The Column "{}" has a loading of : {}'.format(pca_x_2_cols_highest_values[i], pca_x_2_highest_values[i]))
First Principal Component : 10 Highest Loadings (in magnitude)

The Column "Total\ Ethnic group - Unit : Persons" has a loading of : 0.7623846934636165
The Column "White\ English/Welsh/Scottish/Northern Irish/British - Unit : Persons" has a loading of : 0.6458083191628136
The Column "Asian/Asian British\ Pakistani - Unit : Persons" has a loading of : 0.029822459300318606
The Column "Asian/Asian British\ Indian - Unit : Persons" has a loading of : 0.01837128051698193
The Column "White\ Other White - Unit : Persons" has a loading of : 0.014613384216660473
The Column "Asian/Asian British\ Other Asian - Unit : Persons" has a loading of : 0.006891992223282057
The Column "Mixed/multiple ethnic group\ White and Black Caribbean - Unit : Persons" has a loading of : 0.006752280512700077
The Column "White\ Irish - Unit : Persons" has a loading of : 0.006090649978823936
The Column "Black/African/Caribbean/Black British\ Caribbean - Unit : Persons" has a loading of : 0.005930007325027236
The Column "Black/African/Caribbean/Black British\ African - Unit : Persons" has a loading of : 0.004666183233139977

Second Principal Component : 10 Highest Loadings (in magnitude)

The Column "White\ English/Welsh/Scottish/Northern Irish/British - Unit : Persons" has a loading of : -0.7029753771624982
The Column "Total\ Ethnic group - Unit : Persons" has a loading of : 0.5741441487274859
The Column "Asian/Asian British\ Indian - Unit : Persons" has a loading of : 0.2369642564572235
The Column "Asian/Asian British\ Pakistani - Unit : Persons" has a loading of : 0.22554470988200828
The Column "White\ Other White - Unit : Persons" has a loading of : 0.13012912611417335
The Column "Black/African/Caribbean/Black British\ African - Unit : Persons" has a loading of : 0.12038157206823398
The Column "Black/African/Caribbean/Black British\ Caribbean - Unit : Persons" has a loading of : 0.11489542346391608
The Column "Asian/Asian British\ Other Asian - Unit : Persons" has a loading of : 0.09223906462525204
The Column "Asian/Asian British\ Bangladeshi - Unit : Persons" has a loading of : 0.07365485585874772
The Column "Mixed/multiple ethnic group\ White and Black Caribbean - Unit : Persons" has a loading of : 0.05134282020481788
In [889]:
# Let's extract the numeric columns and turn them into a numpy 2d array (matrix)
ethnic_num = filtered_ethnic.select_dtypes(include='number') # selects numeric columns only
ethnic_num.head()
Out[889]:
Total\ Ethnic group - Unit : Persons White\ English/Welsh/Scottish/Northern Irish/British - Unit : Persons White\ Irish - Unit : Persons White\ Gypsy or Irish Traveller - Unit : Persons White\ Other White - Unit : Persons Mixed/multiple ethnic group\ White and Black Caribbean - Unit : Persons Mixed/multiple ethnic group\ White and Black African - Unit : Persons Mixed/multiple ethnic group\ White and Asian - Unit : Persons Mixed/multiple ethnic group\ Other Mixed - Unit : Persons Asian/Asian British\ Indian - Unit : Persons Asian/Asian British\ Pakistani - Unit : Persons Asian/Asian British\ Bangladeshi - Unit : Persons Asian/Asian British\ Chinese - Unit : Persons Asian/Asian British\ Other Asian - Unit : Persons Black/African/Caribbean/Black British\ African - Unit : Persons Black/African/Caribbean/Black British\ Caribbean - Unit : Persons Black/African/Caribbean/Black British\ Other Black - Unit : Persons Number of people classified as Arab ethnic group Other ethnic group\ Any other ethnic group - Unit : Persons
11 505283.0 409793.0 5377.0 614.0 20886.0 4573.0 1098.0 4125.0 2564.0 11368.0 21236.0 1089.0 2554.0 7022.0 4032.0 5175.0 1283.0 853.0 1641.0
12 621210.0 524617.0 4908.0 1508.0 43954.0 2510.0 1385.0 3895.0 3291.0 7430.0 2373.0 2562.0 6723.0 6550.0 3426.0 1647.0 937.0 1370.0 2124.0
13 499858.0 482124.0 1552.0 315.0 8266.0 751.0 314.0 881.0 558.0 892.0 316.0 486.0 1153.0 1219.0 373.0 141.0 65.0 153.0 299.0
16 412905.0 394350.0 1975.0 555.0 7437.0 952.0 431.0 1212.0 805.0 737.0 151.0 525.0 943.0 1477.0 518.0 295.0 111.0 116.0 315.0
17 526671.0 482769.0 3966.0 815.0 17872.0 1947.0 1023.0 2584.0 1919.0 2253.0 317.0 1042.0 1931.0 3600.0 1800.0 771.0 341.0 638.0 1083.0
In [890]:
# Let's compute the distance matrix of pairwise euclidean distances
dist_mat_ethnic = cdist(ethnic_num, ethnic_num) # computes the distances between rows of first dataframe and rows of second dataframe. Here I gave the same dataFrame twice, to get a pairwise distance matrix
print("The shape of this matrix is", dist_mat_ethnic.shape)
The shape of this matrix is (112, 112)
In [891]:
mds_ethnic = MDS(n_components=2, random_state=123, dissimilarity='precomputed') # dissimilarity precomputed means that we provide the distance matrix, otherwise MDS() can compute it itself
mds_x_ethnic = mds_ethnic.fit_transform(dist_mat_ethnic)
In [978]:
# Let's visualize the results
plt.figure(num=5, figsize=(25, 12))
plt.scatter(mds_x_ethnic[:, 0], mds_x_ethnic[:, 1], alpha = .3, s=100)

# We need to loop over every borough, get its name from the borough dataframe and its location from mds_x and add the text to our scatterplot
for index,name in enumerate(filtered_ethnic['GEO_LABEL']):
    plt.annotate(s=name, xy=(mds_x_ethnic[index, 0], mds_x_ethnic[index, 1]))

plt.xlabel('Axis 1')
plt.ylabel('Axis 2')
plt.title('Multidimensional Scaling on Districts and subdivisions (All rows)');
In [ ]:
 

Merging Datasets & Save datasets

Review:

Gender = filtered_Gender
Vehicle accessibility = filtered_vehi
Religion = filtered_reli
health = filtered_health
deprivation = filtered_dep
unpaid carers = filtered_uncar
economic activity = filtered_ea 
population = filtered_pop
ethnicity = filtered_ethnic

  • Gender - (filtered_Gender)
  • Vehicle Accessiblity - (filtered_vehi)
  • religion - (filtered_reli)
  • health - (filtered_health)
  • deprivation - (filtered_dep)
  • unpaid carers - (filtered_uncar)
  • economic activity - (filtered_ea)
  • Population - (filtered_pop)
  • ethnic - (filtered_ethnic)

Lets check all shapes of dataframe

In [1080]:
# saving the filtered_gender
filtered_Gender.to_csv('filtered_gender_england_dataset.csv', header=True, index=False)
filtered_vehi.to_csv('filtered_vehicle_access_england_dataset.csv', header=True, index=False)
filtered_reli.to_csv('filtered_religion_england_dataset.csv', header=True, index=False)
filtered_health.to_csv('filtered_health_england_dataset.csv', header=True, index=False)
filtered_dep.to_csv('filtered_deprivation_england_dataset.csv', header=True, index=False)
filtered_uncar.to_csv('filtered_unpaid_carers_england_dataset.csv', header=True, index=False)
filtered_ea.to_csv('filtered_econ_activity_england_dataset.csv', header=True, index=False)
filtered_pop.to_csv('filtered_population_england_dataset.csv', header=True, index=False)
filtered_ethnic.to_csv('filtered_ethnic_england_dataset.csv', header=True, index=False)
In [908]:
filtered_Gender.shape
Out[908]:
(112, 6)
In [909]:
filtered_vehi.shape
Out[909]:
(112, 11)
In [910]:
filtered_reli.shape
Out[910]:
(112, 14)
In [911]:
filtered_health.shape
Out[911]:
(112, 9)
In [912]:
filtered_dep.shape
Out[912]:
(112, 10)
In [913]:
filtered_uncar.shape
Out[913]:
(112, 9)
In [914]:
filtered_ea.shape
Out[914]:
(112, 20)
In [916]:
filtered_pop.shape
Out[916]:
(112, 11)
In [917]:
filtered_ethnic.shape
Out[917]:
(112, 23)
In [ ]:
 
In [ ]:
#Data on crime commited in the city of London between 03-06(lockdown) merged
df_crime_all=[crime_oct2019,crime_Nov2019,crime_dec2019,crime_jan2020,crime_feb2020,crime_march2020,crime_april2020,crime_may2020,crime_june2020,crime_july2020,crime_august2020,crime_Sep2020,crime_oct2020];
df_crime= pd.concat(df_crime_all , axis=0, join='outer',ignore_index=True, keys=None,
          levels=None, names=None, verify_integrity=False, copy=True)
In [ ]:
df_all123 = pd.concat(df_all,axis=0,join='outer',ignore_index=True,keys=None,levels=None,names=None,verify_integrity=True,copy=False)
In [1051]:
df_test=pd.concat(df_all,axis=1)
In [1057]:
df_genvehi=pd.merge(filtered_Gender,filtered_vehi)

#df_all=[filtered_Gender,filtered_vehi,filtered_reli,filtered_health,filtered_dep,filtered_uncar,filtered_ea,filtered_pop,filtered_ethnic];
In [1060]:
df_genvehi.head()
Out[1060]:
GEO_CODE GEO_LABEL Sex : Total\ Sex - Unit : Persons Sex : Males - Unit : Persons Sex : Females - Unit : Persons Total Number of households which have access to a Vehicle Total number of cars which are privately accessible to citizens(Owned/leased) Total Number of households which do not have access to vehicles Number of households which have access to 1 vehicle Number of households which have access to 2 vehicles Number of households which have accessibilty to 3 vehicles Number of households which have access to 4 or more vehicles
0 E10000002 Buckinghamshire 505283.0 248346.0 256937.0 200727.0 314138.0 25261.0 75300.0 73419.0 18882.0 7865.0
1 E10000003 Cambridgeshire 621210.0 309560.0 311650.0 251241.0 343690.0 43588.0 106212.0 76970.0 17830.0 6641.0
2 E10000006 Cumbria 499858.0 246065.0 253793.0 222042.0 273534.0 47578.0 99389.0 57798.0 12825.0 4452.0
3 E10000009 Dorset 412905.0 201271.0 211634.0 180213.0 253649.0 28021.0 78377.0 54354.0 14024.0 5437.0
4 E10000011 East Sussex 526671.0 253764.0 272907.0 231905.0 292118.0 50674.0 100340.0 60173.0 14750.0 5968.0
In [1058]:
df_3=pd.merge(df_genvehi,filtered_reli)
In [1059]:
df_3.head()
Out[1059]:
GEO_CODE GEO_LABEL Sex : Total\ Sex - Unit : Persons Sex : Males - Unit : Persons Sex : Females - Unit : Persons Total Number of households which have access to a Vehicle Total number of cars which are privately accessible to citizens(Owned/leased) Total Number of households which do not have access to vehicles Number of households which have access to 1 vehicle Number of households which have access to 2 vehicles ... Religion [E][S][W] : Total\ Religion - Unit : Persons Religion [E][S][W] : Christian - Unit : Persons Religion [E][S][W] : Buddhist - Unit : Persons Religion [E][S][W] : Hindu - Unit : Persons Religion [E][S][W] : Jewish - Unit : Persons Religion [E][S][W] : Muslim - Unit : Persons Religion [E][S][W] : Sikh - Unit : Persons Religion [E][S][W] : Other religion - Unit : Persons Religion [E][S][W] : No religion - Unit : Persons Religion [E][S][W] : Religion not stated - Unit : Persons
0 E10000002 Buckinghamshire 505283.0 248346.0 256937.0 200727.0 314138.0 25261.0 75300.0 73419.0 ... 505283.0 305804.0 2207.0 6244.0 1511.0 25781.0 4657.0 1803.0 121190.0 36086.0
1 E10000003 Cambridgeshire 621210.0 309560.0 311650.0 251241.0 343690.0 43588.0 106212.0 76970.0 ... 621210.0 361532.0 3264.0 4142.0 1652.0 8990.0 895.0 2636.0 189016.0 49083.0
2 E10000006 Cumbria 499858.0 246065.0 253793.0 222042.0 273534.0 47578.0 99389.0 57798.0 ... 499858.0 359235.0 1353.0 559.0 203.0 1336.0 64.0 1364.0 101496.0 34248.0
3 E10000009 Dorset 412905.0 201271.0 211634.0 180213.0 253649.0 28021.0 78377.0 54354.0 ... 412905.0 269737.0 1280.0 550.0 519.0 1318.0 88.0 2230.0 104221.0 32962.0
4 E10000011 East Sussex 526671.0 253764.0 272907.0 231905.0 292118.0 50674.0 100340.0 60173.0 ... 526671.0 315659.0 2190.0 1501.0 1074.0 4201.0 178.0 3508.0 155723.0 42637.0

5 rows × 22 columns

In [1062]:
df_4=pd.merge(df_3,filtered_health)
In [1063]:
df_4.head()
Out[1063]:
GEO_CODE GEO_LABEL Sex : Total\ Sex - Unit : Persons Sex : Males - Unit : Persons Sex : Females - Unit : Persons Total Number of households which have access to a Vehicle Total number of cars which are privately accessible to citizens(Owned/leased) Total Number of households which do not have access to vehicles Number of households which have access to 1 vehicle Number of households which have access to 2 vehicles ... Religion [E][S][W] : Muslim - Unit : Persons Religion [E][S][W] : Sikh - Unit : Persons Religion [E][S][W] : Other religion - Unit : Persons Religion [E][S][W] : No religion - Unit : Persons Religion [E][S][W] : Religion not stated - Unit : Persons General health : Very bad health - Unit : Persons General health : Very good health - Unit : Persons General health : Good health - Unit : Persons General health : Fair health - Unit : Persons General health : Bad health - Unit : Persons
0 E10000002 Buckinghamshire 505283.0 248346.0 256937.0 200727.0 314138.0 25261.0 75300.0 73419.0 ... 25781.0 4657.0 1803.0 121190.0 36086.0 3902.0 262901.0 170886.0 53987.0 13607.0
1 E10000003 Cambridgeshire 621210.0 309560.0 311650.0 251241.0 343690.0 43588.0 106212.0 76970.0 ... 8990.0 895.0 2636.0 189016.0 49083.0 5453.0 306910.0 215746.0 73386.0 19715.0
2 E10000006 Cumbria 499858.0 246065.0 253793.0 222042.0 273534.0 47578.0 99389.0 57798.0 ... 1336.0 64.0 1364.0 101496.0 34248.0 6481.0 225018.0 172789.0 71966.0 23604.0
3 E10000009 Dorset 412905.0 201271.0 211634.0 180213.0 253649.0 28021.0 78377.0 54354.0 ... 1318.0 88.0 2230.0 104221.0 32962.0 4467.0 184353.0 148166.0 59671.0 16248.0
4 E10000011 East Sussex 526671.0 253764.0 272907.0 231905.0 292118.0 50674.0 100340.0 60173.0 ... 4201.0 178.0 3508.0 155723.0 42637.0 6886.0 230697.0 187695.0 77795.0 23598.0

5 rows × 27 columns

In [1064]:
df_5=pd.merge(df_4,filtered_dep)
In [1065]:
df_5.head()
Out[1065]:
GEO_CODE GEO_LABEL Sex : Total\ Sex - Unit : Persons Sex : Males - Unit : Persons Sex : Females - Unit : Persons Total Number of households which have access to a Vehicle Total number of cars which are privately accessible to citizens(Owned/leased) Total Number of households which do not have access to vehicles Number of households which have access to 1 vehicle Number of households which have access to 2 vehicles ... General health : Very good health - Unit : Persons General health : Good health - Unit : Persons General health : Fair health - Unit : Persons General health : Bad health - Unit : Persons Total households examined using deprivation indexes Number of households which are not deprived in any of the scrutinised dimensions Number of households which are deprived by 1 of the scrutinised dimensions Number of households which are deprived by 2 of the scrutinised dimensions Number of households which are deprived by 3 of the scrutinised dimensions Number of households which are deprived by 4 of the scrutinised dimensions
0 E10000002 Buckinghamshire 505283.0 248346.0 256937.0 200727.0 314138.0 25261.0 75300.0 73419.0 ... 262901.0 170886.0 53987.0 13607.0 200727.0 106894.0 61449.0 26862.0 5007.0 515.0
1 E10000003 Cambridgeshire 621210.0 309560.0 311650.0 251241.0 343690.0 43588.0 106212.0 76970.0 ... 306910.0 215746.0 73386.0 19715.0 251241.0 123285.0 79821.0 39389.0 8067.0 679.0
2 E10000006 Cumbria 499858.0 246065.0 253793.0 222042.0 273534.0 47578.0 99389.0 57798.0 ... 225018.0 172789.0 71966.0 23604.0 222042.0 96017.0 72142.0 41985.0 10994.0 904.0
3 E10000009 Dorset 412905.0 201271.0 211634.0 180213.0 253649.0 28021.0 78377.0 54354.0 ... 184353.0 148166.0 59671.0 16248.0 180213.0 80487.0 62366.0 31001.0 5823.0 536.0
4 E10000011 East Sussex 526671.0 253764.0 272907.0 231905.0 292118.0 50674.0 100340.0 60173.0 ... 230697.0 187695.0 77795.0 23598.0 231905.0 98156.0 78965.0 43629.0 10054.0 1101.0

5 rows × 33 columns

In [1066]:
df_6=pd.merge(df_5,filtered_ea)
In [1067]:
df_6.head()
Out[1067]:
GEO_CODE GEO_LABEL Sex : Total\ Sex - Unit : Persons Sex : Males - Unit : Persons Sex : Females - Unit : Persons Total Number of households which have access to a Vehicle Total number of cars which are privately accessible to citizens(Owned/leased) Total Number of households which do not have access to vehicles Number of households which have access to 1 vehicle Number of households which have access to 2 vehicles ... Number of households which are not deprived in any of the scrutinised dimensions Number of households which are deprived by 1 of the scrutinised dimensions Number of households which are deprived by 2 of the scrutinised dimensions Number of households which are deprived by 3 of the scrutinised dimensions Number of households which are deprived by 4 of the scrutinised dimensions Care (unpaid); provision of : Provides no unpaid care - Unit : Persons Care (unpaid); provision of : Provides 1 to 19 hours unpaid care a week - Unit : Persons Care (unpaid); provision of : Provides 20 to 49 hours unpaid care a week - Unit : Persons Care (unpaid); provision of : Provides 50 or more hours unpaid care a week - Unit : Persons Care (unpaid); provision of : All categories\ Provision of unpaid care - Unit : Persons
0 E10000002 Buckinghamshire 505283.0 248346.0 256937.0 200727.0 314138.0 25261.0 75300.0 73419.0 ... 106894.0 61449.0 26862.0 5007.0 515.0 455769.0 35820.0 5268.0 8426.0 505283.0
1 E10000003 Cambridgeshire 621210.0 309560.0 311650.0 251241.0 343690.0 43588.0 106212.0 76970.0 ... 123285.0 79821.0 39389.0 8067.0 679.0 561034.0 41313.0 6785.0 12078.0 621210.0
2 E10000006 Cumbria 499858.0 246065.0 253793.0 222042.0 273534.0 47578.0 99389.0 57798.0 ... 96017.0 72142.0 41985.0 10994.0 904.0 443363.0 35927.0 7265.0 13303.0 499858.0
3 E10000009 Dorset 412905.0 201271.0 211634.0 180213.0 253649.0 28021.0 78377.0 54354.0 ... 80487.0 62366.0 31001.0 5823.0 536.0 363583.0 33362.0 5456.0 10504.0 412905.0
4 E10000011 East Sussex 526671.0 253764.0 272907.0 231905.0 292118.0 50674.0 100340.0 60173.0 ... 98156.0 78965.0 43629.0 10054.0 1101.0 467262.0 39537.0 6745.0 13127.0 526671.0

5 rows × 38 columns

In [1068]:
df_7=pd.merge(df_6,filtered_pop)
In [1069]:
df_7.head()
Out[1069]:
GEO_CODE GEO_LABEL Sex : Total\ Sex - Unit : Persons Sex : Males - Unit : Persons Sex : Females - Unit : Persons Total Number of households which have access to a Vehicle Total number of cars which are privately accessible to citizens(Owned/leased) Total Number of households which do not have access to vehicles Number of households which have access to 1 vehicle Number of households which have access to 2 vehicles ... Care (unpaid); provision of : Provides 20 to 49 hours unpaid care a week - Unit : Persons Care (unpaid); provision of : Provides 50 or more hours unpaid care a week - Unit : Persons Care (unpaid); provision of : All categories\ Provision of unpaid care - Unit : Persons Population (usual residents) : Schoolchild or full-time student aged 4 and over at their non-term-time address - Unit : Persons Population (usual residents) : Density (number of persons per Hectare) - Unit : Persons Population (usual residents) : All usual residents - Unit : Persons Population (usual residents) : Area (Hectares) - Unit : Hectares Population (usual residents) : All households - Unit : Households Population (usual residents) : All usual residents in households - Unit : Persons Population (usual residents) : All usual residents in communal establishments - Unit : Persons
0 E10000002 Buckinghamshire 505283.0 248346.0 256937.0 200727.0 314138.0 25261.0 75300.0 73419.0 ... 5268.0 8426.0 505283.0 10771.0 3.22875 505283.0 156494.9 200727.0 497299.0 7984.0
1 E10000003 Cambridgeshire 621210.0 309560.0 311650.0 251241.0 343690.0 43588.0 106212.0 76970.0 ... 6785.0 12078.0 621210.0 9352.0 2.03927 621210.0 304624.2 251241.0 598066.0 23144.0
2 E10000006 Cumbria 499858.0 246065.0 253793.0 222042.0 273534.0 47578.0 99389.0 57798.0 ... 7265.0 13303.0 499858.0 6584.0 0.738718 499858.0 676655.9 222042.0 490939.0 8919.0
3 E10000009 Dorset 412905.0 201271.0 211634.0 180213.0 253649.0 28021.0 78377.0 54354.0 ... 5456.0 10504.0 412905.0 6281.0 1.62457 412905.0 254162.2 180213.0 403366.0 9539.0
4 E10000011 East Sussex 526671.0 253764.0 272907.0 231905.0 292118.0 50674.0 100340.0 60173.0 ... 6745.0 13127.0 526671.0 7111.0 3.08255 526671.0 170855.4 231905.0 515598.0 11073.0

5 rows × 45 columns

In [1070]:
df_8=pd.merge(df_7,filtered_uncar)
In [1071]:
df_8.head()
Out[1071]:
GEO_CODE GEO_LABEL Sex : Total\ Sex - Unit : Persons Sex : Males - Unit : Persons Sex : Females - Unit : Persons Total Number of households which have access to a Vehicle Total number of cars which are privately accessible to citizens(Owned/leased) Total Number of households which do not have access to vehicles Number of households which have access to 1 vehicle Number of households which have access to 2 vehicles ... Care (unpaid); provision of : Provides 20 to 49 hours unpaid care a week - Unit : Persons Care (unpaid); provision of : Provides 50 or more hours unpaid care a week - Unit : Persons Care (unpaid); provision of : All categories\ Provision of unpaid care - Unit : Persons Population (usual residents) : Schoolchild or full-time student aged 4 and over at their non-term-time address - Unit : Persons Population (usual residents) : Density (number of persons per Hectare) - Unit : Persons Population (usual residents) : All usual residents - Unit : Persons Population (usual residents) : Area (Hectares) - Unit : Hectares Population (usual residents) : All households - Unit : Households Population (usual residents) : All usual residents in households - Unit : Persons Population (usual residents) : All usual residents in communal establishments - Unit : Persons
0 E10000002 Buckinghamshire 505283.0 248346.0 256937.0 200727.0 314138.0 25261.0 75300.0 73419.0 ... 5268.0 8426.0 505283.0 10771.0 3.22875 505283.0 156494.9 200727.0 497299.0 7984.0
1 E10000003 Cambridgeshire 621210.0 309560.0 311650.0 251241.0 343690.0 43588.0 106212.0 76970.0 ... 6785.0 12078.0 621210.0 9352.0 2.03927 621210.0 304624.2 251241.0 598066.0 23144.0
2 E10000006 Cumbria 499858.0 246065.0 253793.0 222042.0 273534.0 47578.0 99389.0 57798.0 ... 7265.0 13303.0 499858.0 6584.0 0.738718 499858.0 676655.9 222042.0 490939.0 8919.0
3 E10000009 Dorset 412905.0 201271.0 211634.0 180213.0 253649.0 28021.0 78377.0 54354.0 ... 5456.0 10504.0 412905.0 6281.0 1.62457 412905.0 254162.2 180213.0 403366.0 9539.0
4 E10000011 East Sussex 526671.0 253764.0 272907.0 231905.0 292118.0 50674.0 100340.0 60173.0 ... 6745.0 13127.0 526671.0 7111.0 3.08255 526671.0 170855.4 231905.0 515598.0 11073.0

5 rows × 45 columns

In [1072]:
df_9=pd.merge(df_8,filtered_ethnic)
In [1073]:
df_9
Out[1073]:
GEO_CODE GEO_LABEL Sex : Total\ Sex - Unit : Persons Sex : Males - Unit : Persons Sex : Females - Unit : Persons Total Number of households which have access to a Vehicle Total number of cars which are privately accessible to citizens(Owned/leased) Total Number of households which do not have access to vehicles Number of households which have access to 1 vehicle Number of households which have access to 2 vehicles ... Asian/Asian British\ Indian - Unit : Persons Asian/Asian British\ Pakistani - Unit : Persons Asian/Asian British\ Bangladeshi - Unit : Persons Asian/Asian British\ Chinese - Unit : Persons Asian/Asian British\ Other Asian - Unit : Persons Black/African/Caribbean/Black British\ African - Unit : Persons Black/African/Caribbean/Black British\ Caribbean - Unit : Persons Black/African/Caribbean/Black British\ Other Black - Unit : Persons Number of people classified as Arab ethnic group Other ethnic group\ Any other ethnic group - Unit : Persons
0 E10000002 Buckinghamshire 505283.0 248346.0 256937.0 200727.0 314138.0 25261.0 75300.0 73419.0 ... 11368.0 21236.0 1089.0 2554.0 7022.0 4032.0 5175.0 1283.0 853.0 1641.0
1 E10000003 Cambridgeshire 621210.0 309560.0 311650.0 251241.0 343690.0 43588.0 106212.0 76970.0 ... 7430.0 2373.0 2562.0 6723.0 6550.0 3426.0 1647.0 937.0 1370.0 2124.0
2 E10000006 Cumbria 499858.0 246065.0 253793.0 222042.0 273534.0 47578.0 99389.0 57798.0 ... 892.0 316.0 486.0 1153.0 1219.0 373.0 141.0 65.0 153.0 299.0
3 E10000009 Dorset 412905.0 201271.0 211634.0 180213.0 253649.0 28021.0 78377.0 54354.0 ... 737.0 151.0 525.0 943.0 1477.0 518.0 295.0 111.0 116.0 315.0
4 E10000011 East Sussex 526671.0 253764.0 272907.0 231905.0 292118.0 50674.0 100340.0 60173.0 ... 2253.0 317.0 1042.0 1931.0 3600.0 1800.0 771.0 341.0 638.0 1083.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
107 E09000031 Waltham Forest 258249.0 128970.0 129279.0 96861.0 76217.0 40583.0 40732.0 12225.0 ... 9134.0 26347.0 4632.0 2579.0 11697.0 18815.0 18841.0 7135.0 3776.0 6728.0
108 E09000032 Wandsworth 306995.0 148646.0 158349.0 130493.0 89513.0 59143.0 56409.0 12634.0 ... 8642.0 9718.0 1493.0 3715.0 9770.0 14818.0 12297.0 5641.0 2350.0 4094.0
109 E41000052 Cornwall 534476.0 258907.0 275569.0 231378.0 311157.0 40429.0 103219.0 65012.0 ... 837.0 107.0 280.0 1004.0 1208.0 293.0 369.0 102.0 189.0 644.0
110 E41000324 Westminster 226771.0 115639.0 111132.0 110157.0 50502.0 69574.0 32969.0 6103.0 ... 7429.0 2344.0 6531.0 6180.0 10318.0 9239.0 4495.0 2931.0 15793.0 8698.0
111 E02000001 City of London 7375.0 4091.0 3284.0 4385.0 1692.0 3043.0 1100.0 173.0 ... 216.0 16.0 232.0 263.0 213.0 98.0 46.0 49.0 69.0 85.0

112 rows × 64 columns

In [1124]:
Key = df_9[df_9['GEO_CODE']=='E02000001']
In [1125]:
Key
Out[1125]:
GEO_CODE GEO_LABEL Sex : Total\ Sex - Unit : Persons Sex : Males - Unit : Persons Sex : Females - Unit : Persons Total Number of households which have access to a Vehicle Total number of cars which are privately accessible to citizens(Owned/leased) Total Number of households which do not have access to vehicles Number of households which have access to 1 vehicle Number of households which have access to 2 vehicles ... Asian/Asian British\ Indian - Unit : Persons Asian/Asian British\ Pakistani - Unit : Persons Asian/Asian British\ Bangladeshi - Unit : Persons Asian/Asian British\ Chinese - Unit : Persons Asian/Asian British\ Other Asian - Unit : Persons Black/African/Caribbean/Black British\ African - Unit : Persons Black/African/Caribbean/Black British\ Caribbean - Unit : Persons Black/African/Caribbean/Black British\ Other Black - Unit : Persons Number of people classified as Arab ethnic group Other ethnic group\ Any other ethnic group - Unit : Persons
111 E02000001 London 7375.0 4091.0 3284.0 4385.0 1692.0 3043.0 1100.0 173.0 ... 216.0 16.0 232.0 263.0 213.0 98.0 46.0 49.0 69.0 85.0

1 rows × 64 columns

In [ ]:
 
In [1130]:
df_9.isnull().values.any()
Out[1130]:
False
In [1131]:
#last two changes needed to be made 
df_9.loc[df_9.GEO_CODE == 'E06000055', 'GEO_LABEL'] = "Bedfordshire"
df_9.loc[df_9.GEO_CODE == 'E02000001', 'GEO_LABEL'] = "London"
df_9.loc[df_9.GEO_CODE == 'E06000021', 'GEO_LABEL'] = "Stoke-on-Trent"
In [1132]:
df_9.to_csv('Combined_England_dataset_Engineered.csv', header=True, index=False)
In [7]:
Load=pd.read_csv('Combined_England_dataset_Engineered.csv') 
In [8]:
Load
Out[8]:
GEO_CODE GEO_LABEL Sex : Total\ Sex - Unit : Persons Sex : Males - Unit : Persons Sex : Females - Unit : Persons Total Number of households which have access to a Vehicle Total number of cars which are privately accessible to citizens(Owned/leased) Total Number of households which do not have access to vehicles Number of households which have access to 1 vehicle Number of households which have access to 2 vehicles ... Asian/Asian British\ Indian - Unit : Persons Asian/Asian British\ Pakistani - Unit : Persons Asian/Asian British\ Bangladeshi - Unit : Persons Asian/Asian British\ Chinese - Unit : Persons Asian/Asian British\ Other Asian - Unit : Persons Black/African/Caribbean/Black British\ African - Unit : Persons Black/African/Caribbean/Black British\ Caribbean - Unit : Persons Black/African/Caribbean/Black British\ Other Black - Unit : Persons Number of people classified as Arab ethnic group Other ethnic group\ Any other ethnic group - Unit : Persons
0 E10000002 Buckinghamshire 505283.0 248346.0 256937.0 200727.0 314138.0 25261.0 75300.0 73419.0 ... 11368.0 21236.0 1089.0 2554.0 7022.0 4032.0 5175.0 1283.0 853.0 1641.0
1 E10000003 Cambridgeshire 621210.0 309560.0 311650.0 251241.0 343690.0 43588.0 106212.0 76970.0 ... 7430.0 2373.0 2562.0 6723.0 6550.0 3426.0 1647.0 937.0 1370.0 2124.0
2 E10000006 Cumbria 499858.0 246065.0 253793.0 222042.0 273534.0 47578.0 99389.0 57798.0 ... 892.0 316.0 486.0 1153.0 1219.0 373.0 141.0 65.0 153.0 299.0
3 E10000009 Dorset 412905.0 201271.0 211634.0 180213.0 253649.0 28021.0 78377.0 54354.0 ... 737.0 151.0 525.0 943.0 1477.0 518.0 295.0 111.0 116.0 315.0
4 E10000011 East Sussex 526671.0 253764.0 272907.0 231905.0 292118.0 50674.0 100340.0 60173.0 ... 2253.0 317.0 1042.0 1931.0 3600.0 1800.0 771.0 341.0 638.0 1083.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
107 E09000031 Waltham Forest 258249.0 128970.0 129279.0 96861.0 76217.0 40583.0 40732.0 12225.0 ... 9134.0 26347.0 4632.0 2579.0 11697.0 18815.0 18841.0 7135.0 3776.0 6728.0
108 E09000032 Wandsworth 306995.0 148646.0 158349.0 130493.0 89513.0 59143.0 56409.0 12634.0 ... 8642.0 9718.0 1493.0 3715.0 9770.0 14818.0 12297.0 5641.0 2350.0 4094.0
109 E41000052 Cornwall 534476.0 258907.0 275569.0 231378.0 311157.0 40429.0 103219.0 65012.0 ... 837.0 107.0 280.0 1004.0 1208.0 293.0 369.0 102.0 189.0 644.0
110 E41000324 Westminster 226771.0 115639.0 111132.0 110157.0 50502.0 69574.0 32969.0 6103.0 ... 7429.0 2344.0 6531.0 6180.0 10318.0 9239.0 4495.0 2931.0 15793.0 8698.0
111 E02000001 London 7375.0 4091.0 3284.0 4385.0 1692.0 3043.0 1100.0 173.0 ... 216.0 16.0 232.0 263.0 213.0 98.0 46.0 49.0 69.0 85.0

112 rows × 64 columns

In [ ]:
 
In [ ]:
 
In [ ]: